From e874bf16663a30cd50c168d244211a4ab9c5b97c Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 22 Apr 2026 10:12:40 -0700
Subject: [PATCH 01/12] fix: isCuMemMapAllocated crashes on non-NVLS systems
 even with MSCCLPP_FORCE_DISABLE_NVLS=true (#790)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- [x] Fix `isCuMemMapAllocated()` to just return `true/false` without
throwing when NVLS is not supported
- [x] Fix `isNvlsSupported()` caching bug where `result`/`isChecked`
were never updated
- [x] Restore `[[maybe_unused]]` on `result` and `isChecked` statics —
needed in HIP/ROCm env where `CUDA_NVLS_API_AVAILABLE` is not defined
and the variables would otherwise be unused
- [x] Run linter (`./tools/lint.sh`)

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com>
---
 src/core/gpu_utils.cc | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc
index 628d2dcb..09d5025d 100644
--- a/src/core/gpu_utils.cc
+++ b/src/core/gpu_utils.cc
@@ -283,7 +283,9 @@ bool isNvlsSupported() {
     MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
     MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
     MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
-    return isMulticastSupported == 1;
+    result = (isMulticastSupported == 1);
+    isChecked = true;
+    return result;
   }
   return result;
 #endif
@@ -300,9 +302,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) {
     return false;
   }
   MSCCLPP_CUTHROW(cuMemRelease(handle));
-  if (!isNvlsSupported()) {
-    throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage);
-  }
   return true;
 #endif
 }

From c97be492d5d097f2ab5885e74b029610845400a4 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 Apr 2026 10:32:20 -0700
Subject: [PATCH 02/12] GDRCopy status message to string (#793)

---
 src/core/gdr.cc          | 4 ++--
 src/core/include/gdr.hpp | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/core/gdr.cc b/src/core/gdr.cc
index 22ac15c9..f361a3aa 100644
--- a/src/core/gdr.cc
+++ b/src/core/gdr.cc
@@ -48,7 +48,7 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
 
 bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
 
-const char* gdrStatusMessage() {
+std::string gdrStatusMessage() {
   switch (gdrStatus()) {
     case GdrStatus::Ok:
       return "GDRCopy initialized successfully";
@@ -181,7 +181,7 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
 
 bool gdrEnabled() { return false; }
 
-const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
+std::string gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
 
 // GdrMap::Impl — stub (no GDRCopy)
 
diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp
index e0c7f006..c1378334 100644
--- a/src/core/include/gdr.hpp
+++ b/src/core/include/gdr.hpp
@@ -7,6 +7,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <string>
 
 namespace mscclpp {
 
@@ -25,7 +26,7 @@ GdrStatus gdrStatus();
 bool gdrEnabled();
 
 /// Return a human-readable error message for the current GDRCopy status.
-const char* gdrStatusMessage();
+std::string gdrStatusMessage();
 
 /// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
 /// When GDRCopy is not available, all operations are no-ops and valid() returns false.

From 2c52937b26e6b72846cb8bec2f7479fb90162913 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Tue, 28 Apr 2026 15:02:22 -0700
Subject: [PATCH 03/12] Fix FP8 ROCm build/test issues and dtype naming (#792)

## Summary
- Fix ROCm FP8 build failure by using the actual FP8 `DataType` enum
constants in allreduce packet tuning.
- Fix FP8 E4M3FNUZ test encoding so small negative values do not produce
the FNUZ NaN byte (`0x80`).
- Align FP8 `DataType` enum constants and Python bindings with
torch-style names (`FLOAT8_E4M3FN`, `FLOAT8_E4M3FNUZ`, `FLOAT8_E5M2FNUZ`
/ `float8_e4m3fn`, `float8_e4m3fnuz`, `float8_e5m2fnuz`).

## Validation
- `./tools/lint.sh`
- `make -j` from `build/`
- `mpirun --allow-run-as-root -np 8 python3 -m pytest
python/test/test_fp8_accum.py -q` (`36 passed, 9 skipped`)
- `DTYPE=float8_e4m3fnuz ACCUM_DTYPE=float32 torchrun --nnodes=1
--nproc_per_node=8
examples/torch-integration/customized_comm_with_tuning.py`

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/gpu_data_types.hpp            | 104 ++++--------
 python/csrc/core_py.cpp                       |   6 +-
 python/mscclpp/utils.py                       |  12 +-
 python/test/test_fp8_accum.py                 | 152 +++++++++++++-----
 src/core/algorithm.cc                         |  35 +++-
 src/core/executor/execution_kernel.cu         |   4 +-
 src/core/executor/executor.cc                 |   4 +-
 src/core/include/execution_kernel.hpp         |  27 +++-
 .../collectives/allreduce/allreduce_packet.cu |   5 +-
 .../collectives/include/allreduce/common.hpp  |  14 +-
 src/ext/nccl/algorithm_selector.cc            |   3 +-
 src/ext/nccl/datatype_conversion.hpp          |  43 ++++-
 12 files changed, 271 insertions(+), 138 deletions(-)

diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp
index 41bd5928..672434f9 100644
--- a/include/mscclpp/gpu_data_types.hpp
+++ b/include/mscclpp/gpu_data_types.hpp
@@ -21,7 +21,10 @@ using __bfloat162 = __hip_bfloat162;
 #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6)
 #include <hip/hip_fp8.h>
 
-// Create aliases matching CUDA naming convention for cross-platform compatibility
+// Create aliases matching CUDA naming convention for cross-platform compatibility.
+// Define __FP8_E4M3_IS_FNUZ__ / __FP8_E5M2_IS_FNUZ__ when the platform-native FP8 is the
+// "fnuz" variant (no infinities, NaN-only at 0x80, bias differs from OCP). Dispatch layers
+// use these macros to throw on unsupported variants requested via DataType.
 #if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP)
 using __fp8_e4m3 = __hip_fp8_e4m3_fnuz;
 using __fp8_e5m2 = __hip_fp8_e5m2_fnuz;
@@ -29,6 +32,8 @@ using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz;
 using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz;
 using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz;
 using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz;
+#define __FP8_E4M3_IS_FNUZ__
+#define __FP8_E5M2_IS_FNUZ__
 #else
 using __fp8_e4m3 = __hip_fp8_e4m3;
 using __fp8_e5m2 = __hip_fp8_e5m2;
@@ -66,8 +71,8 @@ using __bfloat162 = __nv_bfloat162;
 
 /// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15.
 /// Format (MSB first): [sign:1][exponent:4][mantissa:3]
-/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention).
-/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6.
+/// No infinities, no NaN. Encode saturates to ±1.75 (0x7e/0xfe).
+/// Adapted from the Triton compiler's fp8e4b15 format.
 struct alignas(1) __fp8_e4m3b15 {
   uint8_t __x;
 
@@ -97,35 +102,15 @@ struct alignas(1) __fp8_e4m3b15 {
   /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8,
   /// then convert fp16 → float32.
   static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) {
-    // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN.
-    uint32_t exp = (bits >> 3) & 0xFu;
-    if (bits == 0x80 || exp == 15) {
-      union {
-        uint32_t u;
-        float f;
-      } nan_val = {0x7FC00000u};
-      return nan_val.f;
-    }
-    if (bits == 0) return 0.0f;
-
-    // Triton-style bit manipulation: fp8 → fp16 → fp32.
-    // fp8 layout: [S:1][E:4][M:3]  (bias=15)
-    // fp16 layout: [S:1][E:5][M:10] (bias=15)
-    //
-    // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1
-    // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15.
+    // Branch-free decode: fp8 → fp16 → fp32, no special-case handling.
+    // Encode saturates to ±1.75, so 0x7f/0xff are never produced.
     // Refer:
     // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34
     uint16_t h = (uint16_t)bits << 8;             // place fp8 in upper byte of fp16
     uint16_t sign16 = h & 0x8000u;                // extract sign at fp16 position
     uint16_t nosign = h & 0x7F00u;                // exponent + mantissa (no sign)
-    uint16_t fp16_bits = sign16 | (nosign >> 1);  // shift exponent right by 1
+    uint16_t fp16_bits = sign16 | (nosign >> 1);  // shift exponent right by 1 (E4→E5)
 
-    // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0
-    // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents
-    // the subnormal fp16 value since both share bias=15.
-
-    // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP).
     union {
       uint16_t u;
       __half h;
@@ -139,14 +124,6 @@ struct alignas(1) __fp8_e4m3b15 {
   /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15),
   /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1.
   static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) {
-    union {
-      float f;
-      uint32_t u;
-    } in = {val};
-
-    // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz).
-    if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u;
-
     // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP).
     __half h_val = __float2half_rn(val);
     union {
@@ -155,32 +132,19 @@ struct alignas(1) __fp8_e4m3b15 {
     } cvt = {h_val};
     uint16_t fp16_bits = cvt.u;
 
-    // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80.
+    // Clamp abs to max encodable value: 1.75 → fp16 = 0x3F00.
+    // Matches Triton: encode saturates, 0x7f/0xff are never produced.
     uint16_t abs_fp16 = fp16_bits & 0x7FFFu;
-    if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u;
+    if (abs_fp16 > 0x3F00u) abs_fp16 = 0x3F00u;
 
     // Reconstruct with sign.
     uint16_t sign16 = fp16_bits & 0x8000u;
 
-    // Triton-style: fp16 → fp8.
-    // fp16 layout: [S:1][E:5][M:10] (bias=15)
-    // fp8 layout:  [S:1][E:4][M:3]  (bias=15)
-    //
-    // mad.lo.u32 a0, a0, 2, 0x00800080  →  (abs_fp16 * 2 + 0x0080)
-    // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias.
-    // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea  →  (sign & 0x8000) | a0
-    // Finally: prmt for byte extraction.
-    //
-    // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte.
+    // fp16 → fp8: shift abs left by 1 (undo decode's right-shift), add rounding bias, take upper byte.
     uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u);
-    // The upper byte now contains [E:4][M:3][round_bit].
-    // Combine with sign and extract.
     uint16_t with_sign = sign16 | adjusted;
     uint8_t result = (uint8_t)(with_sign >> 8);
 
-    // Zero → 0x00 (ensure positive zero, not negative zero which is NaN).
-    if ((result & 0x7Fu) == 0) result = 0x00u;
-
     return result;
   }
 };
@@ -199,16 +163,18 @@ namespace mscclpp {
 
 /// Data types supported by mscclpp operations.
 enum class DataType {
-  INT32,           // 32-bit signed integer.
-  UINT32,          // 32-bit unsigned integer.
-  FLOAT16,         // IEEE 754 half precision.
-  FLOAT32,         // IEEE 754 single precision.
-  BFLOAT16,        // bfloat16 precision.
-  FLOAT8_E4M3,     // float8 with E4M3 layout.
-  FLOAT8_E5M2,     // float8 with E5M2 layout.
-  UINT8,           // 8-bit unsigned integer.
-  FLOAT8_E4M3B15,  // float8 with E4M3 layout, bias=15 (software, no HW accel).
-  AUTO = 255,      // Sentinel: resolve to the input dtype at runtime.
+  INT32,            // 32-bit signed integer.
+  UINT32,           // 32-bit unsigned integer.
+  FLOAT16,          // IEEE 754 half precision.
+  FLOAT32,          // IEEE 754 single precision.
+  BFLOAT16,         // bfloat16 precision.
+  FLOAT8_E4M3FN,    // float8 E4M3, OCP variant (NV; AMD HIP > 6 with OCP enabled).
+  FLOAT8_E4M3FNUZ,  // float8 E4M3, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled).
+  FLOAT8_E5M2,      // float8 E5M2, OCP variant (NV; AMD HIP > 6 with OCP enabled).
+  FLOAT8_E5M2FNUZ,  // float8 E5M2, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled).
+  UINT8,            // 8-bit unsigned integer.
+  FLOAT8_E4M3B15,   // float8 with E4M3 layout, bias=15 (software, no HW accel).
+  AUTO = 255,       // Sentinel: resolve to the input dtype at runtime.
 };
 
 /// Word array.
@@ -1137,11 +1103,11 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
 #if defined(MSCCLPP_DEVICE_CUDA)
   uint32_t in0;
   asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
-  // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
+  // Clamp abs to max encodable e4m3b15 (0x3F00 = 1.75 in fp16).
   uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
   uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
-  alo = alo < 0x3B80u ? alo : 0x3B80u;
-  ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
+  alo = alo < 0x3F00u ? alo : 0x3F00u;
+  ahi = ahi < 0x3F00u ? ahi : 0x3F00u;
   uint32_t a0 = alo | (ahi << 16);
   a0 = a0 * 2u + 0x00800080u;
   uint32_t b0 = a0 | (in0 & 0x80008000u);
@@ -1152,7 +1118,7 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
   uint32_t in0 = v.words[0];
   uint32_t abs0 = in0 & 0x7fff7fffu;
   uint32_t a0;
-  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u));
   a0 = a0 * 2u + 0x00800080u;
   uint32_t b0 = a0 | (in0 & 0x80008000u);
   uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
@@ -1175,8 +1141,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
   asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
   uint32_t abs0 = in0 & 0x7fff7fffu;
   uint32_t abs1 = in1 & 0x7fff7fffu;
-  uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
-  uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
+  uint32_t a0 = __vminu2(abs0, 0x3F003F00u);
+  uint32_t a1 = __vminu2(abs1, 0x3F003F00u);
   a0 = a0 * 2u + 0x00800080u;
   a1 = a1 * 2u + 0x00800080u;
   uint32_t b0, b1;
@@ -1189,8 +1155,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
   uint32_t in0 = v.words[0], in1 = v.words[1];
   uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
   uint32_t a0, a1;
-  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
-  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u));
+  asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3F003F00u));
   a0 = a0 * 2u + 0x00800080u;
   a1 = a1 * 2u + 0x00800080u;
   uint32_t b0 = a0 | (in0 & 0x80008000u);
diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp
index b8649564..a94f9863 100644
--- a/python/csrc/core_py.cpp
+++ b/python/csrc/core_py.cpp
@@ -45,8 +45,10 @@ void register_core(nb::module_& m) {
       .value("float16", DataType::FLOAT16)
       .value("float32", DataType::FLOAT32)
       .value("bfloat16", DataType::BFLOAT16)
-      .value("float8_e4m3", DataType::FLOAT8_E4M3)
+      .value("float8_e4m3fn", DataType::FLOAT8_E4M3FN)
+      .value("float8_e4m3fnuz", DataType::FLOAT8_E4M3FNUZ)
       .value("float8_e5m2", DataType::FLOAT8_E5M2)
+      .value("float8_e5m2fnuz", DataType::FLOAT8_E5M2FNUZ)
       .value("uint8", DataType::UINT8)
       .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
 
@@ -328,4 +330,4 @@ NB_MODULE(_mscclpp, m) {
 
   // ext
   register_algorithm_collection_builder(m);
-}
\ No newline at end of file
+}
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 93cd786b..0f0a28d4 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -192,12 +192,14 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
         return DataType.int32
     elif dtype == torch.bfloat16:
         return DataType.bfloat16
-    # Hardware supports either OCP format or FNUZ format for float8.
-    # Mapping both to the same MSCClPP data type.
-    elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz:
+    elif dtype == torch.float8_e5m2:
         return DataType.float8_e5m2
-    elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz:
-        return DataType.float8_e4m3
+    elif dtype == torch.float8_e5m2fnuz:
+        return DataType.float8_e5m2fnuz
+    elif dtype == torch.float8_e4m3fn:
+        return DataType.float8_e4m3fn
+    elif dtype == torch.float8_e4m3fnuz:
+        return DataType.float8_e4m3fnuz
     elif dtype == torch.uint8:
         return DataType.uint8
     else:
diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py
index 82981ce1..ba33c085 100644
--- a/python/test/test_fp8_accum.py
+++ b/python/test/test_fp8_accum.py
@@ -21,6 +21,13 @@ from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
 # FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs.
 # On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed.
 _is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip
+_gcn_arch_name = ""
+if _is_hip:
+    _gcn_arch_name = cp.cuda.runtime.getDeviceProperties(0).get("gcnArchName", b"")
+    if isinstance(_gcn_arch_name, bytes):
+        _gcn_arch_name = _gcn_arch_name.decode()
+    _gcn_arch_name = _gcn_arch_name.split(":", maxsplit=1)[0]
+_is_cdna4 = _gcn_arch_name.startswith("gfx95")
 _skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89
 pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA")
 
@@ -90,7 +97,78 @@ def float_to_e4m3fn(f32_array, chunk_size=65536):
 
 
 # ---------------------------------------------------------------------------
-# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80)
+# FP8 E4M3FNUZ helpers (AMD/ROCm; bias=8, max=240, NaN = bits==0x80, no -0)
+# ---------------------------------------------------------------------------
+
+
+def e4m3fnuz_to_float(uint8_array):
+    """Decode a cupy uint8 array of E4M3FNUZ bit patterns to float32."""
+    bits = uint8_array.astype(cp.int32)
+    sign = (bits >> 7) & 1
+    exp = (bits >> 3) & 0xF
+    mant = bits & 0x7
+
+    # Normal: (-1)^s * 2^(exp-8) * (1 + mant/8)
+    normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 8).astype(cp.int32))
+    # Subnormal (exp==0): (-1)^s * 2^(-7) * (mant/8)
+    subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-7))
+
+    result = cp.where(exp == 0, subnormal_val, normal_val)
+    result = cp.where(sign == 1, -result, result)
+    # Zero is only 0x00; the 0x80 encoding is reserved for NaN under fnuz.
+    result = cp.where(uint8_array.astype(cp.int32) == 0, cp.float32(0.0), result)
+    nan_mask = uint8_array.astype(cp.int32) == 0x80
+    result = cp.where(nan_mask, cp.float32(float("nan")), result)
+    return result
+
+
+def float_to_e4m3fnuz(f32_array, chunk_size=65536):
+    """Encode a cupy float32 array to uint8 E4M3FNUZ bit patterns.
+
+    Same lookup-table approach as float_to_e4m3fn but using the fnuz table.
+    """
+    all_bytes = cp.arange(128, dtype=cp.uint8)
+    all_floats = e4m3fnuz_to_float(all_bytes)
+    all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
+
+    clamped = f32_array.astype(cp.float32)
+    clamped = cp.clip(clamped, -240.0, 240.0)
+    signs = (clamped < 0).astype(cp.uint8)
+    absval = cp.abs(clamped)
+
+    result = cp.zeros(absval.shape, dtype=cp.uint8)
+    n = absval.size
+    absval_flat = absval.ravel()
+    result_flat = result.ravel()
+
+    for start in range(0, n, chunk_size):
+        end = min(start + chunk_size, n)
+        chunk = absval_flat[start:end]
+        diffs = cp.abs(chunk[:, None] - all_floats[None, :])
+        result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
+
+    result = result_flat.reshape(absval.shape)
+    result = result | (signs << 7)
+    # 0x80 is NaN under fnuz (no negative zero). Collapse any encoding that
+    # landed on 0x80 (small negatives quantised to zero magnitude) to 0x00.
+    result = cp.where(result == 0x80, cp.uint8(0), result)
+    return result
+
+
+# Platform-aware E4M3 native helpers: ROCm CDNA4 and CUDA use OCP fn; older ROCm uses fnuz.
+if _is_hip and not _is_cdna4:
+    e4m3_native_to_float = e4m3fnuz_to_float
+    float_to_e4m3_native = float_to_e4m3fnuz
+    fp8_native_dtype = DataType.float8_e4m3fnuz
+else:
+    e4m3_native_to_float = e4m3fn_to_float
+    float_to_e4m3_native = float_to_e4m3fn
+    fp8_native_dtype = DataType.float8_e4m3fn
+
+
+# ---------------------------------------------------------------------------
+# FP8 E4M3B15 helpers (bias=15, encode saturates to ±1.75, no NaN)
+# Matches Triton's fp8e4b15: all 256 bit patterns are finite.
 # ---------------------------------------------------------------------------
 
 
@@ -108,11 +186,6 @@ def e4m3b15_to_float(uint8_array):
 
     result = cp.where(exp == 0, subnormal_val, normal_val)
     result = cp.where(sign == 1, -result, result)
-    # Zero
-    result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
-    # NaN: exp==15 or negative zero (0x80)
-    nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80)
-    result = cp.where(nan_mask, cp.float32(float("nan")), result)
     return result
 
 
@@ -120,18 +193,17 @@ def float_to_e4m3b15(f32_array, chunk_size=65536):
     """Encode a cupy float32 array to uint8 E4M3B15 bit patterns.
 
     Same lookup-table approach as float_to_e4m3fn.
+    Saturates to ±1.75 (0x7e/0xfe), matching Triton's fp8e4b15.
     """
     # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F)
     all_bytes = cp.arange(128, dtype=cp.uint8)
     all_floats = e4m3b15_to_float(all_bytes)  # (128,) float32
-    # Mark NaN entries as inf so they're never selected as nearest
-    all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
 
-    # Clamp input and extract sign
-    clamped = f32_array.astype(cp.float32)
-    clamped = cp.clip(clamped, -0.9375, 0.9375)
-    signs = (clamped < 0).astype(cp.uint8)
-    absval = cp.abs(clamped)
+    # Clamp input and extract sign.
+    values = f32_array.astype(cp.float32)
+    signs = cp.signbit(values).astype(cp.uint8)
+    absval = cp.abs(values)
+    absval = cp.clip(absval, cp.float32(0.0), cp.float32(1.75))
 
     result = cp.zeros(absval.shape, dtype=cp.uint8)
     n = absval.size
@@ -148,8 +220,6 @@ def float_to_e4m3b15(f32_array, chunk_size=65536):
     # Combine with sign bit
     result = result_flat.reshape(absval.shape)
     result = result | (signs << 7)
-    # Handle exact zero
-    result = cp.where(absval == 0, cp.uint8(0), result)
     return result
 
 
@@ -226,12 +296,6 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
 
     buf = GpuBuffer(size, dtype=cp.uint8)
 
-    accum_configs = [
-        ("fp8_native", DataType.float8_e4m3),
-        ("float16", DataType.float16),
-        ("float32", DataType.float32),
-    ]
-
     # rsag_zero_copy and fullmesh need explicit block/thread counts
     if "rsag" in algo_name:
         nb = max(1, min(32, size // (world_size * 32)))
@@ -243,13 +307,19 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
         nb = 0
         nt = 0
 
+    accum_configs = [
+        ("fp8_native", fp8_native_dtype),
+        ("float16", DataType.float16),
+        ("float32", DataType.float32),
+    ]
+
     errors = {}
     for accum_label, accum_dtype in accum_configs:
         # Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm)
         rng = np.random.RandomState(42 + rank)
         src_f32 = cp.asarray(rng.randn(size).astype(np.float32))
         src_f32 = cp.clip(src_f32, -240.0, 240.0)
-        src_fp8 = float_to_e4m3fn(src_f32)
+        src_fp8 = float_to_e4m3_native(src_f32)
 
         # Copy into symmetric buffer
         buf[:] = src_fp8
@@ -260,12 +330,12 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
             algo,
             comm_group,
             buf,
-            dtype=DataType.float8_e4m3,
+            dtype=fp8_native_dtype,
             accum_dtype=accum_dtype,
             nblocks=nb,
             nthreads_per_block=nt,
         )
-        result_f32 = e4m3fn_to_float(result)
+        result_f32 = e4m3_native_to_float(result)
 
         # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32
         ref_f32 = cp.zeros(size, dtype=cp.float32)
@@ -273,12 +343,13 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
             rng_r = np.random.RandomState(42 + r)
             rank_data = cp.asarray(rng_r.randn(size).astype(np.float32))
             rank_data = cp.clip(rank_data, -240.0, 240.0)
-            rank_data_fp8 = float_to_e4m3fn(rank_data)
-            ref_f32 += e4m3fn_to_float(rank_data_fp8)
+            rank_data_fp8 = float_to_e4m3_native(rank_data)
+            ref_f32 += e4m3_native_to_float(rank_data_fp8)
 
-        # Compute errors
-        abs_err = cp.abs(result_f32 - ref_f32)
-        mean_abs_err = float(cp.mean(abs_err))
+        # Compute errors (only on valid, non-NaN entries)
+        valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
+        abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
+        mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
         errors[accum_label] = mean_abs_err
 
         # Reset between runs
@@ -341,13 +412,10 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
 
     errors = {}
     for accum_label, accum_dtype in accum_configs:
-        # Generate deterministic per-rank random uint8 values in valid e4m3b15 range
+        # Generate deterministic per-rank random uint8 values covering the full e4m3b15 range.
+        # All 256 bit patterns are valid (no NaN in this format).
         rng = np.random.RandomState(42 + rank)
-        raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8))
-        signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7
-        src_uint8 = raw | signs
-        # Fix negative zero -> positive zero
-        src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8)
+        src_uint8 = cp.asarray(rng.randint(0, 256, (size,)).astype(np.uint8))
 
         # Copy into symmetric buffer
         buf[:] = src_uint8
@@ -371,19 +439,15 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
         ref_f32 = cp.zeros(size, dtype=cp.float32)
         for r in range(world_size):
             rng_r = np.random.RandomState(42 + r)
-            raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8))
-            signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7
-            bits_r = raw_r | signs_r
-            bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r)
+            bits_r = cp.asarray(rng_r.randint(0, 256, (size,)).astype(np.uint8))
             ref_f32 += e4m3b15_to_float(bits_r)
 
         # Clamp reference to e4m3b15 representable range
-        ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375)
+        ref_f32 = cp.clip(ref_f32, -1.75, 1.75)
 
-        # Compute errors (only on valid entries)
-        valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
-        abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
-        mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
+        # Compute errors
+        abs_err = cp.abs(result_f32 - ref_f32)
+        mean_abs_err = float(cp.mean(abs_err))
         errors[accum_label] = mean_abs_err
 
         algo.reset()
diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc
index ffa53aa8..c0713daa 100644
--- a/src/core/algorithm.cc
+++ b/src/core/algorithm.cc
@@ -3,6 +3,7 @@
 
 #include <filesystem>
 #include <mscclpp/algorithm.hpp>
+#include <mscclpp/errors.hpp>
 #include <mscclpp/gpu_utils.hpp>
 
 #include "logger.hpp"
@@ -182,13 +183,41 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
                         stream);
       break;
 #if defined(__FP8_TYPES_EXIST__)
-    case DataType::FLOAT8_E4M3:
-      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3,
+    case DataType::FLOAT8_E4M3FN:
+#if defined(__FP8_E4M3_IS_FNUZ__)
+      THROW(EXEC, Error, ErrorCode::InvalidUsage,
+            "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ");
+#else
+      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FN,
                         plan_, stream);
+#endif
+      break;
+    case DataType::FLOAT8_E4M3FNUZ:
+#if defined(__FP8_E4M3_IS_FNUZ__)
+      executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FNUZ,
+                        plan_, stream);
+#else
+      THROW(EXEC, Error, ErrorCode::InvalidUsage,
+            "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN");
+#endif
       break;
     case DataType::FLOAT8_E5M2:
+#if defined(__FP8_E5M2_IS_FNUZ__)
+      THROW(EXEC, Error, ErrorCode::InvalidUsage,
+            "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ");
+#else
       executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2,
                         plan_, stream);
+#endif
+      break;
+    case DataType::FLOAT8_E5M2FNUZ:
+#if defined(__FP8_E5M2_IS_FNUZ__)
+      executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2FNUZ,
+                        plan_, stream);
+#else
+      THROW(EXEC, Error, ErrorCode::InvalidUsage,
+            "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2");
+#endif
       break;
 #endif
     case DataType::FLOAT8_E4M3B15:
@@ -230,4 +259,4 @@ std::pair<std::shared_ptr<void>, size_t> getFlagBuffer() {
   return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
 }
 
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu
index 28ced77f..d639efb7 100644
--- a/src/core/executor/execution_kernel.cu
+++ b/src/core/executor/execution_kernel.cu
@@ -78,8 +78,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
       );
 #endif
       break;
-    case DataType::FLOAT8_E4M3:
+    case DataType::FLOAT8_E4M3FN:
+    case DataType::FLOAT8_E4M3FNUZ:
     case DataType::FLOAT8_E5M2:
+    case DataType::FLOAT8_E5M2FNUZ:
       // FP8 is not supported in CUDA execution kernel.
       break;
     case DataType::FLOAT8_E4M3B15:
diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc
index bf2caf97..fcecc4dd 100644
--- a/src/core/executor/executor.cc
+++ b/src/core/executor/executor.cc
@@ -7,7 +7,6 @@
 #include <mscclpp/switch_channel.hpp>
 #include <mscclpp/utils.hpp>
 
-#include "debug.h"
 #include "execution_kernel.hpp"
 #include "execution_plan.hpp"
 
@@ -509,8 +508,7 @@ Executor::Executor(std::shared_ptr<Communicator> comm, std::shared_ptr<char> def
 void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuffSize,
                        [[maybe_unused]] size_t recvBuffSize, DataType dataType, const ExecutionPlan& plan,
                        cudaStream_t stream, PacketType packetType) {
-  INFO(MSCCLPP_EXECUTOR, "Starting execution with plan: %s, collective: %s", plan.name().c_str(),
-       plan.collective().c_str());
+  INFO(LogSubsys::EXEC, "Starting execution with plan: ", plan.name(), ", collective: ", plan.collective());
   size_t sendMemRange, recvMemRange;
   CUdeviceptr sendBasePtr, recvBasePtr;
   MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendMemRange, (CUdeviceptr)sendbuff));
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
index 87b88888..cb808bc8 100644
--- a/src/core/include/execution_kernel.hpp
+++ b/src/core/include/execution_kernel.hpp
@@ -17,6 +17,7 @@
 #include <mscclpp/switch_channel_device.hpp>
 
 #include "execution_common.hpp"
+#include "logger.hpp"
 #include "reduce_kernel.hpp"
 namespace mscclpp {
 
@@ -876,7 +877,19 @@ class ExecutionKernel {
 #endif
         break;
 #if defined(__FP8_TYPES_EXIST__)
-      case DataType::FLOAT8_E4M3:
+      case DataType::FLOAT8_E4M3FN:
+      case DataType::FLOAT8_E4M3FNUZ:
+#if defined(__FP8_E4M3_IS_FNUZ__)
+        if (dataType == DataType::FLOAT8_E4M3FN) {
+          THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
+                "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ");
+        }
+#else
+        if (dataType == DataType::FLOAT8_E4M3FNUZ) {
+          THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
+                "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN");
+        }
+#endif
         executionKernel<__fp8_e4m3, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
             rank, (__fp8_e4m3*)src, (__fp8_e4m3*)dst, (__fp8_e4m3*)scratch, scratchOffset, scratchChunkSize, plan,
             semaphores, localMemoryIdBegin, flag
@@ -888,6 +901,18 @@ class ExecutionKernel {
 #endif
         break;
       case DataType::FLOAT8_E5M2:
+      case DataType::FLOAT8_E5M2FNUZ:
+#if defined(__FP8_E5M2_IS_FNUZ__)
+        if (dataType == DataType::FLOAT8_E5M2) {
+          THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
+                "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ");
+        }
+#else
+        if (dataType == DataType::FLOAT8_E5M2FNUZ) {
+          THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
+                "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2");
+        }
+#endif
         executionKernel<__fp8_e5m2, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
             rank, (__fp8_e5m2*)src, (__fp8_e5m2*)dst, (__fp8_e5m2*)scratch, scratchOffset, scratchChunkSize, plan,
             semaphores, localMemoryIdBegin, flag
diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu
index e2d8ef73..6199f192 100644
--- a/src/ext/collectives/allreduce/allreduce_packet.cu
+++ b/src/ext/collectives/allreduce/allreduce_packet.cu
@@ -200,7 +200,8 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
   {
     bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
 #if defined(__FP8_TYPES_EXIST__)
-    isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
+    isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
+            dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
 #endif
     if (isFp8) {
       if (inputSize < (64 << 10)) {
@@ -310,4 +311,4 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
 }
 
 }  // namespace collective
-}  // namespace mscclpp
\ No newline at end of file
+}  // namespace mscclpp
diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp
index 1e0e7e69..93b18e26 100644
--- a/src/ext/collectives/include/allreduce/common.hpp
+++ b/src/ext/collectives/include/allreduce/common.hpp
@@ -101,10 +101,20 @@ AllreduceFunc dispatchByDtype(mscclpp::DataType dtype, mscclpp::DataType accumDt
     return Adapter<Op, __bfloat16, __bfloat16>::call;
 #endif
 #if defined(__FP8_TYPES_EXIST__)
-  } else if (dtype == mscclpp::DataType::FLOAT8_E4M3) {
+#if defined(__FP8_E4M3_IS_FNUZ__)
+  } else if (dtype == mscclpp::DataType::FLOAT8_E4M3FNUZ) {
     return dispatchFp8Accum<Op, __fp8_e4m3, Adapter>(accumDtype, dtype);
+#else
+  } else if (dtype == mscclpp::DataType::FLOAT8_E4M3FN) {
+    return dispatchFp8Accum<Op, __fp8_e4m3, Adapter>(accumDtype, dtype);
+#endif
+#if defined(__FP8_E5M2_IS_FNUZ__)
+  } else if (dtype == mscclpp::DataType::FLOAT8_E5M2FNUZ) {
+    return dispatchFp8Accum<Op, __fp8_e5m2, Adapter>(accumDtype, dtype);
+#else
   } else if (dtype == mscclpp::DataType::FLOAT8_E5M2) {
     return dispatchFp8Accum<Op, __fp8_e5m2, Adapter>(accumDtype, dtype);
+#endif
 #endif
   } else if (dtype == mscclpp::DataType::FLOAT8_E4M3B15) {
     return dispatchFp8Accum<Op, __fp8_e4m3b15, Adapter>(accumDtype, dtype);
@@ -125,4 +135,4 @@ AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype, mscclpp::DataType a
 }  // namespace collective
 }  // namespace mscclpp
 
-#endif  // MSCCLPP_ALLREDUCE_COMMON_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_ALLREDUCE_COMMON_HPP_
diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc
index 0b9592d7..c94aab34 100644
--- a/src/ext/nccl/algorithm_selector.cc
+++ b/src/ext/nccl/algorithm_selector.cc
@@ -20,7 +20,8 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da
     return false;
   }
 
-  const bool isFp8 = dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
+  const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
+                     dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
 
   if (!isFp8) {
     return nvlsSupported;
diff --git a/src/ext/nccl/datatype_conversion.hpp b/src/ext/nccl/datatype_conversion.hpp
index dcfb645a..a5c74def 100644
--- a/src/ext/nccl/datatype_conversion.hpp
+++ b/src/ext/nccl/datatype_conversion.hpp
@@ -28,12 +28,21 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
       return mscclpp::DataType::BFLOAT16;
 #ifdef __FP8_TYPES_EXIST__
     case ncclFloat8e4m3:
-      return mscclpp::DataType::FLOAT8_E4M3;
+#if defined(__FP8_E4M3_IS_FNUZ__)
+      return mscclpp::DataType::FLOAT8_E4M3FNUZ;
+#else
+      return mscclpp::DataType::FLOAT8_E4M3FN;
+#endif
     case ncclFloat8e5m2:
+#if defined(__FP8_E5M2_IS_FNUZ__)
+      return mscclpp::DataType::FLOAT8_E5M2FNUZ;
+#else
       return mscclpp::DataType::FLOAT8_E5M2;
+#endif
 #endif
     default:
-      throw mscclpp::Error("Unsupported ncclDataType_t: " + std::to_string(dtype), mscclpp::ErrorCode::InvalidUsage);
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "Unsupported ncclDataType_t: " + std::to_string(dtype));
   }
 }
 
@@ -41,8 +50,10 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
 inline size_t getDataTypeSize(mscclpp::DataType dtype) {
   switch (dtype) {
     case mscclpp::DataType::UINT8:
-    case mscclpp::DataType::FLOAT8_E4M3:
+    case mscclpp::DataType::FLOAT8_E4M3FN:
+    case mscclpp::DataType::FLOAT8_E4M3FNUZ:
     case mscclpp::DataType::FLOAT8_E5M2:
+    case mscclpp::DataType::FLOAT8_E5M2FNUZ:
     case mscclpp::DataType::FLOAT8_E4M3B15:
       return 1;
     case mscclpp::DataType::FLOAT16:
@@ -72,10 +83,32 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) {
     case mscclpp::DataType::BFLOAT16:
       return ncclBfloat16;
 #ifdef __FP8_TYPES_EXIST__
-    case mscclpp::DataType::FLOAT8_E4M3:
+#if defined(__FP8_E4M3_IS_FNUZ__)
+    case mscclpp::DataType::FLOAT8_E4M3FNUZ:
       return ncclFloat8e4m3;
+    case mscclpp::DataType::FLOAT8_E4M3FN:
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ for NCCL collectives");
+#else
+    case mscclpp::DataType::FLOAT8_E4M3FN:
+      return ncclFloat8e4m3;
+    case mscclpp::DataType::FLOAT8_E4M3FNUZ:
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN for NCCL collectives");
+#endif
+#if defined(__FP8_E5M2_IS_FNUZ__)
+    case mscclpp::DataType::FLOAT8_E5M2FNUZ:
+      return ncclFloat8e5m2;
+    case mscclpp::DataType::FLOAT8_E5M2:
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ for NCCL collectives");
+#else
     case mscclpp::DataType::FLOAT8_E5M2:
       return ncclFloat8e5m2;
+    case mscclpp::DataType::FLOAT8_E5M2FNUZ:
+      THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
+            "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2 for NCCL collectives");
+#endif
 #endif
     case mscclpp::DataType::FLOAT8_E4M3B15:
       // float8_e4m3b15 has no NCCL equivalent; NCCL cannot reduce this type correctly.
@@ -98,4 +131,4 @@ inline mscclpp::ReduceOp ncclRedOpToMscclpp(ncclRedOp_t op) {
   }
 }
 
-#endif  // MSCCLPP_DATATYPE_CONVERSION_HPP_
\ No newline at end of file
+#endif  // MSCCLPP_DATATYPE_CONVERSION_HPP_

From 9ec26fa4d11325ca33dd4dca83b99dee9146e6bf Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Mon, 4 May 2026 15:11:47 -0700
Subject: [PATCH 04/12] Reset GPU tokens before reuse (#795)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes a token-reuse bug in `TokenPool` that's independent of MNNVL.

## Bug

`TokenPool` hands out 8-byte device-memory slots used as
device-semaphore counters. The deleter only cleared the bitmap — the
underlying GPU memory was left as-is. When a token was freed and later
re-allocated, the new semaphore inherited the previous counter value
instead of starting at 0, breaking subsequent `signal()/wait()` math.

## Fix

* Add a synchronous `gpuMemset` host helper (mirrors `gpuMemcpy` /
`gpuMemcpyAsync`).
* Zero the slot inside the `TokenPool` deleter so recycled tokens hand
out a clean counter. The very-first allocation is already zeroed by
`gpuCallocPhysical` (`src/core/gpu_utils.cc:227-228`), so first-time
tokens are also clean — the deleter only has to handle the recycle case.

## Notes

* Public wrapper is named `mscclpp::gpuMemset` (not `mscclpp::memset`)
for symmetry with `gpuMemcpy` and to avoid shadowing `std::memset` in
TUs that pull the namespace in.
* Zeroing happens on release rather than acquire so the cost is paid in
the typically less perf-sensitive teardown path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 include/mscclpp/gpu_utils.hpp | 7 +++++++
 src/core/gpu_utils.cc         | 7 +++++++
 src/core/utils_internal.cc    | 3 +++
 3 files changed, 17 insertions(+)

diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp
index ecd13c47..b079e0fd 100644
--- a/include/mscclpp/gpu_utils.hpp
+++ b/include/mscclpp/gpu_utils.hpp
@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
 void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
                     cudaMemcpyKind kind = cudaMemcpyDefault);
 void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
+void gpuMemset(void* ptr, int value, size_t bytes);
 
 /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
 /// destroyed.
@@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
   detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
 }
 
+/// Sets `bytes` of memory at `ptr` to `value` synchronously.
+/// @param ptr Destination address.
+/// @param value Value to set (interpreted as unsigned char per CUDA semantics).
+/// @param bytes Number of bytes to set.
+inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
+
 /// Check if NVLink SHARP (NVLS) is supported.
 ///
 /// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc
index 09d5025d..1ce61322 100644
--- a/src/core/gpu_utils.cc
+++ b/src/core/gpu_utils.cc
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
   MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
 }
 
+void gpuMemset(void* ptr, int value, size_t bytes) {
+  AvoidCudaGraphCaptureGuard cgcGuard;
+  CudaStreamWithFlags stream(cudaStreamNonBlocking);
+  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
+  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
+}
+
 }  // namespace detail
 
 bool isNvlsSupported() {
diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc
index 9504a52c..8cc55430 100644
--- a/src/core/utils_internal.cc
+++ b/src/core/utils_internal.cc
@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
 
 std::shared_ptr<uint64_t> TokenPool::getToken() {
   auto deleter = [self = shared_from_this()](uint64_t* token) {
+    // Zero the slot on release so the next allocator hands out a clean
+    // semaphore counter (matches a freshly-allocated slot).
+    mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
     size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
     size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
     uint64_t mask = 1UL << bit;

From 822fbb235168d3bf85112bdf53a55cbfc86e4737 Mon Sep 17 00:00:00 2001
From: Mahdieh Ghazi <mahdiehghazi@microsoft.com>
Date: Tue, 5 May 2026 17:17:41 -0400
Subject: [PATCH 05/12] Adding necessary macros for enabling mrc support (#797)

This PR adds necessary macros and instructions for enabling mrc support with no atomic.
---
 CMakeLists.txt              |  1 +
 docs/quickstart.md          | 39 +++++++++++++++++++++++++++++++++++++
 src/core/CMakeLists.txt     |  4 ++++
 src/core/ibverbs_wrapper.cc | 36 ++++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef8b785a..49154e0b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
 option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
 option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
 option(MSCCLPP_USE_IB "Use InfiniBand." ON)
+option(MSCCLPP_USE_MRC "Enable MRC support" OFF)
 option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
 option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
 option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 83a08d6a..716fcf61 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]"
 $ python -m pip install ".[cuda12,benchmark,test]"
 ```
 
+(mrc-support)=
+## MRC Support
+
+MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience.
+
+To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below.
+
+---
+
+### 1. Install MRC Verbs Shim
+
+MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation.
+
+- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster.
+- Ensure that the underlying system has MRC support enabled.
+
+---
+
+### 2. Build MSCCL++ with MRC Enabled
+
+Enable MRC support during the build by adding the following CMake option:
+
+```bash
+-DMSCCLPP_USE_MRC=ON
+```
+
+This configures MSCCL++ to use the MRC-enabled verbs layer at runtime.
+
+### 3. Configure Runtime Environment
+
+At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack:
+
+```bash
+-x MSCCLPP_IBV_SO=:$MRC-SHIM-HOME/libibverbs.so
+-x LD_LIBRARY_PATH=$MRC-SHIM-HOME/mrc-header-lib:$LD_LIBRARY_PATH
+-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
+-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
+```
+
 (vscode-dev-container)=
 ## VSCode Dev Container
 
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 9ca5fed3..5b89eedc 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS)
     target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
 endif()
 
+if(MSCCLPP_USE_MRC)
+	target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC)
+endif()
+
 # libmscclpp
 add_library(mscclpp SHARED)
 target_link_libraries(mscclpp PUBLIC mscclpp_obj)
diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc
index 51f3f29c..60ee0694 100644
--- a/src/core/ibverbs_wrapper.cc
+++ b/src/core/ibverbs_wrapper.cc
@@ -10,11 +10,28 @@
 
 #include "logger.hpp"
 
+// Adding MSCCLPP_USE_MRC micro for MRC enablement.
+// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
+// environment variable is not set.
+#if (MSCCLPP_USE_MRC)
+#include <cstdlib>
+#include <set>
+#endif  // (MSCCLPP_USE_MRC)
+
 namespace mscclpp {
 
 static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
+#if (MSCCLPP_USE_MRC)
+static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
+#endif  // (MSCCLPP_USE_MRC)
 
 void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
+#if (MSCCLPP_USE_MRC)
+  static std::set<std::string> mrcSymbols = {
+      "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
+      "ibv_create_cq",       "ibv_destroy_cq",      "ibv_create_qp",   "ibv_modify_qp",    "ibv_destroy_qp",
+  };
+#endif  // (MSCCLPP_USE_MRC)
   if (!globalIBVerbsHandle) {
     if (mscclpp::env()->ibvSo != "") {
       void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
@@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
       THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
     }
   }
+#if (MSCCLPP_USE_MRC)
+  // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
+  char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
+  void* ptr;
+  if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
+    // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
+    if (!globalOrigIBVerbsHandle) {
+      void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
+      if (!handle) {
+        THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
+      }
+      globalOrigIBVerbsHandle.reset(handle);
+    }
+    ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
+  } else {
+    ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+  }
+#else   // !(MSCCLPP_USE_MRC)
   void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
+#endif  // !(MSCCLPP_USE_MRC)
   if (!ptr && !allowReturnNull) {
     THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
   }

From 0c9b9abfd512c51e5df4d9b27f39627c3693013b Mon Sep 17 00:00:00 2001
From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com>
Date: Tue, 12 May 2026 13:45:55 -0700
Subject: [PATCH 06/12] Adding Support 4 Nodes AllReduce Small Message Size
 (#794)

Results on 4 Nodes H200:

| Size | NCCL  | MSCCL++ 57TB | MSCCL++ 29TB |
|------|-------|--------------|--------------|
| 8K   | 45.75 | 17.74        | 18.18        |
| 16K  | 47.08 | 18.9         | 18.42        |
| 32K  | 47.29 | 19.48        | 19.12        |
| 64K  | 50.34 | 20.51        | 19.29        |
| 128K | 59.65 | 21.37        | 20.25        |
| 256K | 87.46 | 23.87        | 23.51        |
| 512K | 106.55| 29.15        | 29.51        |
| 1M   | 115   | 40.64        | 41.83        |
| 2M   | 135.89| 63.73        | 70.45        |
| 4M   | 177.59| 121.76       | 128.79       |
| 8M   | 251.17| 228.5        | 251.36       |

---------

Co-authored-by: Binyang Li <binyli@microsoft.com>
Co-authored-by: Caio Rocha <caiorocha@microsof.com>
---
 python/mscclpp/__main__.py                    |  46 ++++++-
 python/mscclpp/default_algos/__init__.py      |   4 +-
 ...uce_2nodes.py => allreduce_multi_nodes.py} | 124 +++++++++++++-----
 .../algorithm_collection_builder.cc           |   4 +-
 4 files changed, 143 insertions(+), 35 deletions(-)
 rename python/mscclpp/default_algos/{allreduce_2nodes.py => allreduce_multi_nodes.py} (61%)

diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py
index 6a6f5f28..450ec748 100644
--- a/python/mscclpp/__main__.py
+++ b/python/mscclpp/__main__.py
@@ -13,7 +13,7 @@ from mscclpp.language.utils import AlgoSpec
 default_algo_configs = [
     {
         "filename": "allreduce_2nodes_1K_64K.json",
-        "function": def_algo.allreduce_2nodes,
+        "function": def_algo.allreduce_multi_nodes,
         "spec": AlgoSpec(
             name="allreduce_2nodes_1K_64K",
             collective=AllReduce(16, 1, True),
@@ -34,7 +34,7 @@ default_algo_configs = [
     },
     {
         "filename": "allreduce_2nodes_128K_2M.json",
-        "function": def_algo.allreduce_2nodes,
+        "function": def_algo.allreduce_multi_nodes,
         "spec": AlgoSpec(
             name="allreduce_2nodes_128K_2M",
             collective=AllReduce(16, 1, True),
@@ -53,6 +53,48 @@ default_algo_configs = [
         ),
         "additional_kwargs": {"thread_block_group_size": 4},
     },
+    {
+        "filename": "allreduce_4nodes_1K_64K.json",
+        "function": def_algo.allreduce_multi_nodes,
+        "spec": AlgoSpec(
+            name="allreduce_4nodes_1K_64K",
+            collective=AllReduce(32, 1, True),
+            nranks_per_node=8,
+            world_size=32,
+            in_place=True,
+            instances=1,
+            protocol="LL",
+            auto_sync=False,
+            num_threads_per_block=1024,
+            reuse_resources=True,
+            use_double_scratch_buffer=True,
+            min_message_size=1 << 10,
+            max_message_size=64 << 10,
+            tags={"default": 1},
+        ),
+        "additional_kwargs": {"thread_block_group_size": 1},
+    },
+    {
+        "filename": "allreduce_4nodes_128K_2M.json",
+        "function": def_algo.allreduce_multi_nodes,
+        "spec": AlgoSpec(
+            name="allreduce_4nodes_128K_2M",
+            collective=AllReduce(32, 1, True),
+            nranks_per_node=8,
+            world_size=32,
+            in_place=True,
+            instances=1,
+            protocol="LL",
+            auto_sync=False,
+            num_threads_per_block=1024,
+            reuse_resources=True,
+            use_double_scratch_buffer=True,
+            min_message_size=128 << 10,
+            max_message_size=2 << 20,
+            tags={"default": 1},
+        ),
+        "additional_kwargs": {"thread_block_group_size": 4},
+    },
 ]
 
 
diff --git a/python/mscclpp/default_algos/__init__.py b/python/mscclpp/default_algos/__init__.py
index a5cfa882..1767aab6 100644
--- a/python/mscclpp/default_algos/__init__.py
+++ b/python/mscclpp/default_algos/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from mscclpp.default_algos.allreduce_2nodes import allreduce_2nodes
+from mscclpp.default_algos.allreduce_multi_nodes import allreduce_multi_nodes
 
-__all__ = ["allreduce_2nodes"]
+__all__ = ["allreduce_multi_nodes"]
diff --git a/python/mscclpp/default_algos/allreduce_2nodes.py b/python/mscclpp/default_algos/allreduce_multi_nodes.py
similarity index 61%
rename from python/mscclpp/default_algos/allreduce_2nodes.py
rename to python/mscclpp/default_algos/allreduce_multi_nodes.py
index 5a355887..5697a0e3 100644
--- a/python/mscclpp/default_algos/allreduce_2nodes.py
+++ b/python/mscclpp/default_algos/allreduce_multi_nodes.py
@@ -2,9 +2,11 @@
 # Licensed under the MIT License.
 
 """
-Multi-node AllReduce implementation using packet-based communication.
-This implements a hierarchical AllReduce: intra-node allreduce followed by
-inter-node exchange and final intra-node allreduce.
+Generalized multi-node AllReduce implementation using packet-based communication.
+This implements a hierarchical AllReduce for N nodes:
+1. Intra-node reduce-scatter (each GPU reduces its assigned chunk across the node)
+2. Inter-node allreduce (exchange fully intra-reduced chunks across all nodes)
+3. Intra-node broadcast (distribute the fully reduced chunks back to all GPUs in the node)
 """
 
 from mscclpp.language.utils import AlgoSpec
@@ -15,7 +17,7 @@ from mscclpp.language.program import *
 from mscclpp.language.collectives import *
 
 
-def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram:
+def allreduce_multi_nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram:
     """
     Implements a multi-node AllReduce using a hierarchical approach:
     1. Intra-node allreduce
@@ -23,10 +25,10 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
     3. Intra-node allreduce
     """
     # Configuration constants
-    num_nodes = 2
+    num_nodes = spec.world_size // spec.nranks_per_node
     gpus_per_node = spec.nranks_per_node
     total_gpus = num_nodes * gpus_per_node
-    packets_per_gpu = 2
+    packets_per_gpu = num_nodes
 
     with CollectiveProgram.from_spec(spec) as prog:
         # Initialize communication channels and buffers
@@ -54,11 +56,21 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                 )
             )
 
-        scratch_buffer_size = packets_per_gpu * (total_gpus + 1)
+        # Scratch buffer layout (3 contiguous regions):
+        #   Region 1 [0, total_gpus):
+        #     Intra-node reduce-scatter. Each GPU receives chunks from gpus_per_node peers,
+        #     packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots.
+        #   Region 2 [total_gpus, total_gpus + num_nodes * packets_per_gpu):
+        #     Inter-node exchange. Each GPU receives reduced chunks from num_nodes nodes,
+        #     packets_per_gpu each → num_nodes * packets_per_gpu slots.
+        #   Region 3 [total_gpus + num_nodes * packets_per_gpu, end):
+        #     Intra-node broadcast. Each GPU receives final reduced data from gpus_per_node peers,
+        #     packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots.
+        # Total = 2 * total_gpus + num_nodes * packets_per_gpu
+        scratch_buffer_size = 2 * total_gpus + packets_per_gpu * num_nodes
         for node_id in range(num_nodes):
             for local_gpu_id in range(gpus_per_node):
                 current_rank_id = local_gpu_id + gpus_per_node * node_id
-                next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus
                 scratch_buffers.append(Buffer(current_rank_id, scratch_buffer_size))
                 for peer_gpu_id in range(gpus_per_node):
                     if peer_gpu_id != local_gpu_id:
@@ -66,7 +78,12 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                         intra_node_memory_channels[(peer_rank_id, current_rank_id)] = MemoryChannel(
                             peer_rank_id, current_rank_id
                         )
-                inter_node_port_channels[current_rank_id] = PortChannel(next_node_rank_id, current_rank_id)
+                for peer_node_id in range(num_nodes):
+                    if peer_node_id != node_id:
+                        peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus
+                        inter_node_port_channels[(current_rank_id, peer_node_rank_id)] = PortChannel(
+                            peer_node_rank_id, current_rank_id
+                        )
 
         # AllReduce
         for node_id in range(num_nodes):
@@ -74,7 +91,6 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                 current_rank_id = local_gpu_id + gpus_per_node * node_id
                 current_rank = Rank(current_rank_id)
                 input_buffer = current_rank.get_input_buffer()
-                next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus
 
                 # Intra Node Exchange Data
                 for peer_gpu_id in range(gpus_per_node):
@@ -118,27 +134,32 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                 )
 
                 inter_node_offset = total_gpus
-                inter_node_port_channels[current_rank_id].put_packets(
-                    scratch_buffers[next_node_rank_id][
-                        inter_node_offset
-                        + local_gpu_id * packets_per_gpu : inter_node_offset
-                        + local_gpu_id * packets_per_gpu
-                        + packets_per_gpu
-                    ],
-                    scratch_buffers[current_rank_id][
-                        local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
-                    ],
-                    tb=0,
-                )
+                for peer_node_id in range(num_nodes):
+                    if peer_node_id != node_id:
+                        peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus
+                        inter_node_port_channels[(current_rank_id, peer_node_rank_id)].put_packets(
+                            scratch_buffers[peer_node_rank_id][
+                                inter_node_offset
+                                + node_id * packets_per_gpu : inter_node_offset
+                                + node_id * packets_per_gpu
+                                + packets_per_gpu
+                            ],
+                            scratch_buffers[current_rank_id][
+                                local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
+                            ],
+                            tb=0,
+                        )
 
                 # Reduce Received Data from Remote Node
                 inter_node_data = [
                     scratch_buffers[current_rank_id][
                         inter_node_offset
-                        + local_gpu_id * packets_per_gpu : inter_node_offset
-                        + local_gpu_id * packets_per_gpu
+                        + peer_node_id * packets_per_gpu : inter_node_offset
+                        + peer_node_id * packets_per_gpu
                         + packets_per_gpu
                     ]
+                    for peer_node_id in range(num_nodes)
+                    if peer_node_id != node_id
                 ]
                 current_rank.reduce(
                     input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
@@ -148,12 +169,18 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                 )
 
                 current_rank.copy_packets(
-                    scratch_buffers[current_rank_id][scratch_buffer_size - packets_per_gpu : scratch_buffer_size],
+                    scratch_buffers[current_rank_id][
+                        inter_node_offset
+                        + node_id * packets_per_gpu : inter_node_offset
+                        + node_id * packets_per_gpu
+                        + packets_per_gpu
+                    ],
                     input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
                     tb_group=global_intra_node_tbg,
                 )
 
                 # Broadcast Reduced Data
+                broadcast_offset = total_gpus + packets_per_gpu * num_nodes
                 for peer_gpu_id in range(gpus_per_node):
                     peer_rank_id = peer_gpu_id + gpus_per_node * node_id
 
@@ -161,13 +188,16 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                         tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1
                         intra_node_memory_channels[(peer_rank_id, current_rank_id)].read_put_packets(
                             scratch_buffers[peer_rank_id][
-                                inter_node_offset
-                                + local_gpu_id * packets_per_gpu : inter_node_offset
+                                broadcast_offset
+                                + local_gpu_id * packets_per_gpu : broadcast_offset
                                 + local_gpu_id * packets_per_gpu
                                 + packets_per_gpu
                             ],
                             scratch_buffers[current_rank_id][
-                                scratch_buffer_size - packets_per_gpu : scratch_buffer_size
+                                inter_node_offset
+                                + node_id * packets_per_gpu : inter_node_offset
+                                + node_id * packets_per_gpu
+                                + packets_per_gpu
                             ],
                             tb_group=thread_block_groups[tbg_id],
                         )
@@ -181,8 +211,8 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                                 peer_gpu_id * packets_per_gpu : peer_gpu_id * packets_per_gpu + packets_per_gpu
                             ],
                             scratch_buffers[current_rank_id][
-                                inter_node_offset
-                                + peer_gpu_id * packets_per_gpu : inter_node_offset
+                                broadcast_offset
+                                + peer_gpu_id * packets_per_gpu : broadcast_offset
                                 + peer_gpu_id * packets_per_gpu
                                 + packets_per_gpu
                             ],
@@ -190,3 +220,37 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
                         )
 
     return prog
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--name", type=str, help="name of the program")
+    parser.add_argument("--num_gpus", type=int, help="total number of gpus")
+    parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
+    parser.add_argument("--tbg", type=int, default=1, help="thread block group size")
+    parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+    parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+    parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+    args = parser.parse_args()
+
+    spec = AlgoSpec(
+        name=args.name,
+        collective=AllReduce(args.num_gpus, 1, True),
+        nranks_per_node=args.gpus_per_node,
+        world_size=args.num_gpus,
+        in_place=True,
+        instances=1,
+        protocol="LL",
+        auto_sync=False,
+        num_threads_per_block=args.num_threads_per_block,
+        reuse_resources=True,
+        use_double_scratch_buffer=True,
+        min_message_size=args.min_message_size,
+        max_message_size=args.max_message_size,
+    )
+
+    prog = allreduce_multi_nodes(spec, args.tbg)
+    print(prog.to_json())
diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc
index 2a7e6e91..5d196d12 100644
--- a/src/ext/collectives/algorithm_collection_builder.cc
+++ b/src/ext/collectives/algorithm_collection_builder.cc
@@ -113,7 +113,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra
   };
   static const std::vector<DslAlgoConfig> defaultAlgoConfigs = {
       {"allreduce_2nodes_1K_64K.json", "allreduce", 8, 16, {{"default", 1}}},
-      {"allreduce_2nodes_64K_2M.json", "allreduce", 8, 16, {{"default", 1}}}};
+      {"allreduce_2nodes_128K_2M.json", "allreduce", 8, 16, {{"default", 1}}},
+      {"allreduce_4nodes_1K_64K.json", "allreduce", 8, 32, {{"default", 1}}},
+      {"allreduce_4nodes_128K_2M.json", "allreduce", 4, 64, {{"default", 1}}}};
   AlgorithmCollection collection;
 
   static auto generateFileId = [](const std::string& input) {

From 40295df4c4d0b57a028f29320996f50587330656 Mon Sep 17 00:00:00 2001
From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com>
Date: Thu, 14 May 2026 09:56:11 -0700
Subject: [PATCH 07/12] Adding Support to bf16 Executor Tests (#801)

This pull request adds support for the `bfloat16` (bf16) data type to
the test executor, including both Python and CUDA components. The
changes ensure that `bfloat16` is handled consistently across argument
parsing, data type conversion, and test kernel implementations.
Additionally, the CUDA verification kernels are refactored to use
parameterized tolerances for improved numerical accuracy checks.

**Support for bfloat16 data type:**

* Added handling for `bfloat16`/`bf16` in the Python test executor's
argument parsing, data type conversion (`parse_dtype`,
`dtype_to_mscclpp_dtype`), and help text.
[[1]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcR27-R28)
[[2]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL122-R135)
[[3]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL246-R251)
* Updated output to display the correct data type string for `bfloat16`.

**CUDA kernel and test improvements:**

* Included `bfloat16` headers and defined test data fill and gather
kernels for `bfloat16` on both CUDA and HIP platforms.
[[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R8-R11)
[[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R35)
[[3]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R54-R59)
[[4]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R133)
* Refactored verification kernels (`ALL_REDUCE`, `REDUCE_SCATTER`) to
use an explicit tolerance parameter (`Eps`) and added correct tolerances
for each data type, including `bfloat16`.
[[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L69-R85)
[[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L94-R113)

These changes ensure full support for `bfloat16` in the test executor
and improve the accuracy and maintainability of the CUDA test kernels.

---------

Co-authored-by: Caio Rocha <caiorocha@microsof.com>
---
 python/test/executor_test.py          | 21 ++++++++++-------
 python/test/executor_test_verifier.cu | 33 +++++++++++++++++++--------
 2 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/python/test/executor_test.py b/python/test/executor_test.py
index 59bc1661..8a309de5 100644
--- a/python/test/executor_test.py
+++ b/python/test/executor_test.py
@@ -24,6 +24,8 @@ def parse_dtype(dtype_str):
     dtype_str = dtype_str.strip().lower()
     if dtype_str == "float16":
         return cp.float16
+    elif dtype_str in ("bfloat16", "bf16"):
+        return cp.float16  # same 2-byte size; mscclpp DataType is resolved from dtype_str
     elif dtype_str == "float32":
         return cp.float32
     elif dtype_str == "int32":
@@ -119,15 +121,18 @@ def parse_size(size_str):
         return int(size_str)
 
 
-def dtype_to_mscclpp_dtype(dtype):
-    if dtype == cp.float16:
+def dtype_to_mscclpp_dtype(dtype_str):
+    dtype_str = dtype_str.strip().lower()
+    if dtype_str == "float16":
         return DataType.float16
-    elif dtype == cp.float32:
+    elif dtype_str in ("bfloat16", "bf16"):
+        return DataType.bfloat16
+    elif dtype_str == "float32":
         return DataType.float32
-    elif dtype == cp.int32:
+    elif dtype_str == "int32":
         return DataType.int32
     else:
-        raise ValueError(f"Unknown data type: {dtype}")
+        raise ValueError(f"Unknown data type: {dtype_str}")
 
 
 def build_bufs(
@@ -205,7 +210,7 @@ def main(
         result_buf.data.ptr,
         input_buf.nbytes,
         result_buf.nbytes,
-        dtype_to_mscclpp_dtype(dtype),
+        dtype_to_mscclpp_dtype(dtype_str),
         execution_plan,
         stream.ptr,
         packet_type,
@@ -231,7 +236,7 @@ def main(
         npkit.shutdown()
     print(
         f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
-        f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
+        f"data size: {result_buf.nbytes} bytes data type: {dtype_str} "
         f"packet type: {packet_type}"
     )
     executor = None
@@ -243,7 +248,7 @@ if __name__ == "__main__":
     parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
     parser.add_argument("--size", type=str, required=True)
     parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
-    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
+    parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32")
     parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
     parser.add_argument("--n_iters", type=int, default=10)
     parser.add_argument("--n_graph_iters", type=int, default=10)
diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu
index cf3cd4a6..e7749197 100644
--- a/python/test/executor_test_verifier.cu
+++ b/python/test/executor_test_verifier.cu
@@ -4,8 +4,10 @@
 #include <assert.h>
 
 #if defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_bfloat16.h>
 #include <hip/hip_fp16.h>
 #else
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #endif
 
@@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) {
     }                                                                                                    \
   }
 
+FILL_DATA(bfloat16, __nv_bfloat16)
 FILL_DATA(float16, __half)
 FILL_DATA(float32, float)
 FILL_DATA(int32, int)
@@ -48,11 +51,12 @@ FILL_DATA(int32, int)
     }                                                                                                      \
   }
 
+TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
 TEST_DATA_ALL_GATHER(float16, __half)
 TEST_DATA_ALL_GATHER(float32, float)
 TEST_DATA_ALL_GATHER(int32, int)
 
-#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType)                                                       \
+#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps)                                                  \
   extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType(               \
       DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
     for (int rank = 0; rank < num_ranks; rank++) {                                                         \
@@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int)
       }                                                                                                    \
     }                                                                                                      \
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
-      assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks);                           \
+      float expected = float(test_buf[i]);                                                                 \
+      float result = float(result_buf[i]);                                                                 \
+      float tol = Eps * num_ranks * (1.0f + abs(expected));                                                \
+      assert(abs(result - expected) <= tol);                                                               \
     }                                                                                                      \
   }
 
-TEST_DATA_ALL_REDUCE(float16, __half)
-TEST_DATA_ALL_REDUCE(float32, float)
-TEST_DATA_ALL_REDUCE(int32, int)
+TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
+TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f)
+TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
+TEST_DATA_ALL_REDUCE(int32, int, 0.0f)
 
-#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType)                                                   \
+#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps)                                              \
   extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType(           \
       DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) {   \
     int nem_elems_per_rank = num_elems / num_ranks;                                                        \
@@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int)
     }                                                                                                      \
     for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) {   \
       if (i >= offset && i < offset + nem_elems_per_rank) {                                                \
-        assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks);                \
+        float expected = float(test_buf[i]);                                                               \
+        float result = float(result_buf[i - offset]);                                                      \
+        float tol = Eps * num_ranks * (1.0f + abs(expected));                                              \
+        assert(abs(result - expected) <= tol);                                                             \
       }                                                                                                    \
     }                                                                                                      \
   }
 
-TEST_DATA_REDUCE_SCATTER(float16, __half)
-TEST_DATA_REDUCE_SCATTER(float32, float)
-TEST_DATA_REDUCE_SCATTER(int32, int)
+TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f)
+TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
+TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
+TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)
 
 #define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType)                                                       \
   extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType(               \
@@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
     }                                                                                                      \
   }
 
+TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
 TEST_DATA_ALL_TO_ALL(float16, __half)
 TEST_DATA_ALL_TO_ALL(float32, float)
 TEST_DATA_ALL_TO_ALL(int32, int)
\ No newline at end of file

From 5d608feaa52f83e7b556fde647a45b63eb119047 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 14 May 2026 14:06:12 -0700
Subject: [PATCH 08/12] Enhance cross-node CudaIpc availability check (#803)

---
 include/mscclpp/utils.hpp          |  9 +++++++++
 src/core/connection.cc             | 11 +++++++++++
 src/core/context.cc                | 17 ++++++++--------
 src/core/gpu_ipc_mem.cc            | 27 ++++++++++++++++++++++----
 test/mp_unit/port_channel_tests.cu | 31 +++++++++++++++++++++++++++---
 5 files changed, 79 insertions(+), 16 deletions(-)

diff --git a/include/mscclpp/utils.hpp b/include/mscclpp/utils.hpp
index ffe269da..54cfa4a0 100644
--- a/include/mscclpp/utils.hpp
+++ b/include/mscclpp/utils.hpp
@@ -46,6 +46,15 @@ std::string getIBDeviceName(Transport ibTransport);
 /// @return The InfiniBand transport associated with the specified device name.
 Transport getIBTransportByDeviceName(const std::string& ibDeviceName);
 
+/// Check whether this process can allocate/import CUDA memory with NVIDIA fabric handles
+/// (`CU_MEM_HANDLE_TYPE_FABRIC`). Fabric handles enable cross-node `Transport::CudaIpc` on
+/// MNNVL systems (e.g., GB200 NVL72) when the IMEX service is running. Returns `false` on
+/// hardware/software stacks without MNNVL+IMEX, in which case `Transport::CudaIpc` is
+/// restricted to ranks within the same node.
+///
+/// @return `true` if fabric handles are usable from this process, `false` otherwise.
+bool isFabricMemHandleAvailable();
+
 }  // namespace mscclpp
 
 #endif  // MSCCLPP_UTILS_HPP_
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 8b6c0afb..11ecd968 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -82,6 +82,17 @@ MSCCLPP_API_CPP int Connection::getMaxWriteQueueSize() const { return impl_->get
 CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
                                      const Endpoint& remoteEndpoint)
     : BaseConnection(context, localEndpoint) {
+  // Log fabric/MNNVL availability exactly once per process so any later cross-node CudaIpc failure
+  // is easy to triage. C++11 magic statics make this thread-safe without an explicit mutex.
+  // NOTE: assigning the message to a std::string first avoids the logger's pointer-formatting
+  // overload from kicking in on the const char* result of the ternary.
+  [[maybe_unused]] static const bool fabricAvailable_ = []() {
+    const bool avail = isFabricMemHandleAvailable();
+    const std::string status = avail ? "available (cross-node CudaIpc via MNNVL/IMEX is supported)"
+                                     : "NOT available (CudaIpc is restricted to intra-node ranks on this system)";
+    INFO(CONN, "CudaIpc transport selected: fabric handles ", status);
+    return avail;
+  }();
   if (localEndpoint.transport() != Transport::CudaIpc || remoteEndpoint.transport() != Transport::CudaIpc) {
     THROW(CONN, Error, ErrorCode::InternalError, "CudaIpc transport is required for CudaIpcConnection");
   }
diff --git a/src/core/context.cc b/src/core/context.cc
index aabe71df..b55939e3 100644
--- a/src/core/context.cc
+++ b/src/core/context.cc
@@ -4,12 +4,11 @@
 #include "context.hpp"
 
 #include <mscclpp/env.hpp>
-#include <sstream>
 
 #include "api.h"
 #include "connection.hpp"
-#include "debug.h"
 #include "endpoint.hpp"
+#include "logger.hpp"
 #include "registered_memory.hpp"
 
 namespace mscclpp {
@@ -78,19 +77,17 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
 
 MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
   if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
-    throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for local endpoint");
   }
   if (remoteEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().id < 0) {
-    throw Error("No GPU device ID provided for remote endpoint", ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for remote endpoint");
   }
   auto localTransport = localEndpoint.transport();
   auto remoteTransport = remoteEndpoint.transport();
   if (localTransport != remoteTransport &&
       !(AllIBTransports.has(localTransport) && AllIBTransports.has(remoteTransport))) {
-    std::stringstream ss;
-    ss << "Transport mismatch between local (" << localTransport << ") and remote (" << remoteEndpoint.transport()
-       << ") endpoints";
-    throw Error(ss.str(), ErrorCode::InvalidUsage);
+    THROW(CONN, Error, ErrorCode::InvalidUsage, "Transport mismatch between local (", localTransport, ") and remote (",
+          remoteTransport, ") endpoints");
   }
   std::shared_ptr<BaseConnection> conn;
   if (localTransport == Transport::CudaIpc) {
@@ -100,7 +97,9 @@ MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const
   } else if (localTransport == Transport::Ethernet) {
     conn = std::make_shared<EthernetConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
   } else {
-    throw Error("Unsupported transport", ErrorCode::InternalError);
+    THROW(CONN, Error, ErrorCode::InternalError, "Unsupported transport: ", localTransport,
+          " (this usually means EndpointConfig.transport was left at Transport::Unknown — "
+          "set it explicitly to CudaIpc, an IB transport, or Ethernet)");
   }
   return Connection(conn);
 }
diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc
index c863ecdd..0f58ed20 100644
--- a/src/core/gpu_ipc_mem.cc
+++ b/src/core/gpu_ipc_mem.cc
@@ -7,6 +7,7 @@
 
 #include <cstring>
 #include <mscclpp/gpu_utils.hpp>
+#include <mscclpp/utils.hpp>
 
 #include "logger.hpp"
 #include "unix_socket.hpp"
@@ -35,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typ
   return os;
 }
 
-[[maybe_unused]] static bool isFabricMemHandleAvailable() {
+bool isFabricMemHandleAvailable() {
 #if (CUDA_NVLS_API_AVAILABLE)
   static int resultCache = -1;  // -1: uninitialized, 0: not available, 1: available
   if (resultCache != -1) {
@@ -283,11 +284,19 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
     THROW(GPU, Error, ErrorCode::InvalidUsage, "GpuIpcMemHandle type is None, cannot create GpuIpcMem");
   }
   if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
-    if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
-        CUDA_SUCCESS) {
+    CUresult res =
+        cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC);
+    if (res == CUDA_SUCCESS) {
       // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
       handle_.fabric.allocHandle = {};
       type_ = GpuIpcMemHandle::Type::Fabric;
+    } else {
+      const char* errStr = nullptr;
+      (void)cuGetErrorString(res, &errStr);
+      const std::string errMsg = errStr ? std::string(errStr) : std::string("unknown CUDA error");
+      WARN(GPU, "Fabric IPC handle import failed (", errMsg,
+           "); cross-node CudaIpc requires NVIDIA MNNVL hardware and a running IMEX service. ",
+           "Falling back to other handle types if available.");
     }
   }
   if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::PosixFd)) {
@@ -303,7 +312,17 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
     type_ = GpuIpcMemHandle::Type::RuntimeIpc;
   }
   if (type_ == GpuIpcMemHandle::Type::None) {
-    THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (type: ", handle_.typeFlags, ")");
+    const bool fabricOnly = (handle_.typeFlags == GpuIpcMemHandle::Type::Fabric);
+    const std::string hint = fabricOnly
+                                 ? std::string(
+                                       "The remote rank sent only a Fabric (MNNVL) handle, but this rank could not "
+                                       "import it. Check that the IMEX daemon is running on both nodes and that the "
+                                       "GPUs share an NVLink fabric.")
+                                 : std::string(
+                                       "All handle types failed to import; check IMEX service and POSIX FD socket "
+                                       "availability.");
+    THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (offered types: ", handle_.typeFlags, "). ",
+          hint);
   }
 }
 
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 3b14ed31..4b1b0cfb 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -36,6 +36,18 @@ inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) {
 #define REQUIRE_GDR_FOR_IB_MODE(mode)  // No extra requirements on non-CUDA platforms.
 #endif
 
+// Skip an IPC-only PortChannel test (useIPC=true, useIB=false, useEthernet=false) when CudaIpc
+// cannot connect this rank pair. CudaIpc works intra-node always, and cross-node only on MNNVL
+// systems (GB200 NVL72 + IMEX). The combined check is "at least 2 ranks per node" OR "fabric
+// (MNNVL) handles are usable on this system".
+#define REQUIRE_CUDA_IPC_AVAILABLE                                           \
+  do {                                                                       \
+    if (gEnv->nRanksPerNode < 2 && !mscclpp::isFabricMemHandleAvailable()) { \
+      SKIP_TEST() << "CudaIpc requires intra-node ranks (nRanksPerNode>=2) or MNNVL fabric handles, \
+both unavailable here.";                                                     \
+    }                                                                        \
+  } while (0)
+
 void PortChannelOneToOneTest::SetUp() {
   // Use only two ranks
   setNumRanksToUse(2);
@@ -71,7 +83,10 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vector<mscclpp::PortChan
       continue;
     }
     mscclpp::EndpointConfig cfg;
-    if ((rankToNode(r) == rankToNode(gEnv->rank)) && useIPC) {
+    if (useIPC) {
+      // CudaIpc works intra-node always, and cross-node on MNNVL systems (GB200 NVL72 + IMEX)
+      // via fabric handles. Tests that exercise CudaIpc across nodes on non-MNNVL hardware should
+      // gate themselves with REQUIRE_CUDA_IPC_AVAILABLE; we always request CudaIpc here when asked.
       cfg.transport = mscclpp::Transport::CudaIpc;
     } else if (useIb) {
       cfg.transport = ibTransport;
@@ -262,6 +277,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
 }
 
 TEST(PortChannelOneToOneTest, PingPong) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
   testPingPong(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
@@ -279,6 +295,7 @@ TEST(PortChannelOneToOneTest, PingPongEthernet) {
 }
 
 TEST(PortChannelOneToOneTest, PingPongWithPoll) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
   testPingPong(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
 }
@@ -291,6 +308,7 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
 }
 
 PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
   testPingPongPerf(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }
@@ -482,7 +500,10 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
   proxyService->stopProxy();
 }
 
-TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
+TEST(PortChannelOneToOneTest, PacketPingPong) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
+  testPacketPingPong(false, IbMode::Default);
+}
 
 TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
   REQUIRE_IBVERBS;
@@ -490,7 +511,10 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
   testPacketPingPong(true, IbMode::Host);
 }
 
-PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
+PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
+  testPacketPingPongPerf(false, IbMode::Default);
+}
 
 PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
   REQUIRE_IBVERBS;
@@ -583,6 +607,7 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
 }
 
 PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
+  REQUIRE_CUDA_IPC_AVAILABLE;
   testBandwidth(PingPongTestParams{
       .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
 }

From 5911998181a60eeb546300843bb1f76f504df97d Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Thu, 14 May 2026 21:29:10 +0000
Subject: [PATCH 09/12] ext/ep: gate NVLS HT B2 on cross-host fabric IPC
 support (H100 fix)

The NVLS HT B2 path introduced in 3ab2e43b activated whenever
isNvlsSupported() && num_rdma_ranks > 1. On H100 NDv5 / Azure CX-7 RoCE
that is true (H100 has intra-node NVLink multicast), but there is no
cross-host NVSwitch fabric. mscclpp's GpuIpcMem::create then falls back
to CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR whose handle exchange routes
through /tmp/mscclpp_bootstrap_<pid>.sock -- a master-rank-0 unix-domain
socket worker ranks cannot reach. Symptom on every commit since 3ab2e43b:

    RuntimeError: connect() failed for unix socket to
    /tmp/mscclpp_bootstrap_<pid>.sock

MSCCLPP_EP_FABRIC_IPC=0 was being silently ignored.

src/ext/ep/buffer.cc: add resolve_fabric_ipc_supported() helper.
Resolution:
  1. MSCCLPP_EP_FABRIC_IPC env var (0/off/false/no => off,
     1/on/true/yes/force => on, otherwise auto).
  2. Auto-detect: requires both
       - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED == 1
       - device compute capability >= sm_100 (Blackwell+).

Gate both use_fabric_ipc_alloc (RDMA buffer allocator) and nvls_ht_enabled
(HT B2 multicast region) on fabric_ipc_supported. On H100 both fall back
to cudaMalloc + legacy PortChannel; on GB200 NVL72 both remain enabled.
Diagnostic prints now show fabric_ipc=.

test/python/ext/ep/test_internode_multirank.py: replace hardcoded
NUM_MAX_NVL_PEERS=4 with a runtime _detect_local_world_size() helper
that reads MSCCLPP_EP_LOCAL_WORLD_SIZE / LOCAL_WORLD_SIZE /
OMPI_COMM_WORLD_LOCAL_SIZE, falling back to torch.cuda.device_count().
Makes the test correct on both H100 (8 GPUs/node) and GB200 (4 GPUs/node)
without code changes.

src/core/atomicadd_kernel.cu: use cuCtxCreate_v4 for CUDA >= 12.5 (the
underlying symbol was renamed); preserve legacy 3-arg cuCtxCreate for
older toolkits.

Verified on 2x H100 NDv5 at HEAD:
  LL intranode  (8 GPUs)            PASS
  LL internode  (16 GPUs, 2 nodes)  PASS
  HT intranode  (8 GPUs)            PASS
  HT internode  (16 GPUs, 2 nodes)  PASS

Diagnostic on H100:
  [mscclpp_ep] rdma_buffer allocator: cudaMalloc (low_latency=0, nvls=1, fabric_ipc=0)
  [mscclpp_ep] NVLS HT multicast: disabled (low_latency=0, num_rdma_ranks=2, nvls_supported=1, fabric_ipc=0)
---
 src/core/atomicadd_kernel.cu                  |  4 +-
 src/ext/ep/buffer.cc                          | 79 +++++++++++++++++--
 .../python/ext/ep/test_internode_multirank.py | 22 +++++-
 3 files changed, 94 insertions(+), 11 deletions(-)

diff --git a/src/core/atomicadd_kernel.cu b/src/core/atomicadd_kernel.cu
index 59e660d7..21427df6 100644
--- a/src/core/atomicadd_kernel.cu
+++ b/src/core/atomicadd_kernel.cu
@@ -41,9 +41,9 @@ void CudaIpcStream::atomicAdd(uint64_t* dst, int64_t value) {
     // 4-arg form on new toolkits, fall back to the legacy 3-arg form on
     // CUDA < 12.5 so we keep compiling against older drivers/toolkits.
 #if CUDA_VERSION >= 12050
-    res = cuCtxCreate(&proxyAtomicCtx_, NULL, 0, cuDevice);
+    res = cuCtxCreate_v4(&proxyAtomicCtx_, NULL, 0, cuDevice);
 #else
-    res = cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice);
+    res = cuCtxCreate(&proxyAtomicCtx_vice);
 #endif
     if (res != CUDA_SUCCESS) throw Error("cuCtxCreate failed", ErrorCode::InternalError);
 
diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc
index 74aa7922..02b38c36 100644
--- a/src/ext/ep/buffer.cc
+++ b/src/ext/ep/buffer.cc
@@ -59,6 +59,56 @@ static int resolve_num_proxy_services() {
   return 8;
 }
 
+// Cross-host cuMem fabric IPC capability.
+//
+// `isNvlsSupported()` returns true for any device with NVLink multicast,
+// including H100. But NVLS by itself only works inside one host's NVLink
+// island; cross-host sharing of cuMem allocations / NVLS multicast handles
+// requires the device to be on an actual cross-host NVLink fabric (GB200
+// NVL72 with nvidia-imex on Azure today). H100 reports
+// `CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED == 1` too but lacks
+// the NVSwitch fabric to actually share fabric handles across hosts; the
+// cross-host import then falls back to POSIX-FD, whose handle exchange
+// routes through a unix-domain socket on the master host -- which
+// worker-node ranks cannot reach (`connect() failed for unix socket to
+// /tmp/mscclpp_bootstrap_*.sock`). That is the exact failure signature
+// commit 3ab2e43b ("NVLS HT B2 phases 1-3") introduced on H100 / Azure
+// CX-7 RoCE.
+//
+// To stay safe by default we require both:
+//   - device attr `CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED`
+//   - compute capability >= sm_100 (Blackwell+).
+//
+// Resolution order:
+//   1. `MSCCLPP_EP_FABRIC_IPC` env var (`0`/`off`/`false`/`no` => off,
+//      `1`/`on`/`true`/`yes`/`force` => on, anything else => auto). When
+//      set, env value takes precedence over the device check.
+//   2. Auto-detect via the two checks above.
+static bool resolve_fabric_ipc_supported() {
+  if (const char* env = std::getenv("MSCCLPP_EP_FABRIC_IPC")) {
+    std::string v(env);
+    for (auto& c : v) c = std::tolower(static_cast<unsigned char>(c));
+    if (v == "0" || v == "off" || v == "false" || v == "no") return false;
+    if (v == "1" || v == "on" || v == "true" || v == "yes" || v == "force") return true;
+  }
+  int dev = 0;
+  if (cudaGetDevice(&dev) != cudaSuccess) return false;
+  CUdevice cuDev;
+  if (cuDeviceGet(&cuDev, dev) != CUDA_SUCCESS) return false;
+  int supported = 0;
+  if (cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDev) != CUDA_SUCCESS) {
+    return false;
+  }
+  if (!supported) return false;
+  cudaDeviceProp prop{};
+  if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) return false;
+  // Blackwell+ (sm_100, GB200 NVL72) is the only deployed cross-host
+  // NVLink fabric today. H100 (sm_90) advertises fabric-handle support
+  // but lacks the nvidia-imex / NVSwitch fabric to actually share them
+  // across hosts.
+  return prop.major >= 10;
+}
+
 Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode)
     : rank(rank),
       num_ranks(num_ranks),
@@ -346,8 +396,16 @@ void Buffer::sync(const std::vector<int>& device_ids,
     // fabric-IPC so cross-node `handle.put(data)` can be replaced by
     // direct kernel-side writes via NVL72 fabric pointers — bypassing
     // the broken Azure CX-7 RoCE RDMA WRITE path.
+    //
+    // NVLS-supported but FABRIC-unsupported deployments (e.g. H100 on
+    // Azure NDv5) must not take this path for the HT cross-host case:
+    // their `gpuCallocPhysical` result is a POSIX-FD-only handle whose
+    // cross-host import falls back to a master-local unix socket which
+    // worker ranks cannot reach.
+    static const bool fabric_ipc_supported = resolve_fabric_ipc_supported();
     const bool use_fabric_ipc_alloc =
-        mscclpp::isNvlsSupported() && (low_latency_mode || num_rdma_ranks > 1);
+        mscclpp::isNvlsSupported() && fabric_ipc_supported &&
+        (low_latency_mode || num_rdma_ranks > 1);
     if (use_fabric_ipc_alloc) {
       rdma_buffer_ptr = mscclpp::detail::gpuCallocPhysical(num_rdma_bytes);
       CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
@@ -356,9 +414,9 @@ void Buffer::sync(const std::vector<int>& device_ids,
       CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
     }
     if (rank == 0) {
-      printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d)\n",
+      printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d, fabric_ipc=%d)\n",
              use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc",
-             (int)low_latency_mode, (int)mscclpp::isNvlsSupported());
+             (int)low_latency_mode, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported);
       fflush(stdout);
     }
     bootstrap->barrier();
@@ -501,12 +559,20 @@ void Buffer::sync(const std::vector<int>& device_ids,
     // or there is only one RDMA rank (intranode-only), `nvls_ht_enabled`
     // stays `false` and kernels select the legacy PortChannel path.
     //
+    // Additionally: NVLS alone is insufficient for cross-host. H100 has
+    // NVLS within a node but no cross-host NVSwitch fabric, so
+    // `connectNvlsCollective` either fails or builds a per-node-only
+    // multicast object, and the POSIX-FD fallback for handle exchange
+    // routes through a master-local unix socket that worker ranks cannot
+    // reach. Gate this path on `fabric_ipc_supported` so non-Blackwell
+    // deployments cleanly use the legacy PortChannel path.
+    //
     // Skipped for `low_latency_mode` since LL has its own (working)
     // fabric-IPC path via Proposal A and does not use the HT counter
     // protocol.
     // ------------------------------------------------------------------
     nvls_ht_enabled = false;
-    if (!low_latency_mode && num_rdma_ranks > 1 && mscclpp::isNvlsSupported()) {
+    if (!low_latency_mode && num_rdma_ranks > 1 && mscclpp::isNvlsSupported() && fabric_ipc_supported) {
       // Worst-case sizing — chosen so the same multicast buffer fits any
       // (num_sms, num_rdma_ranks) configuration the kernels may launch with.
       const size_t kCounterBytesPerChannel =
@@ -576,8 +642,9 @@ void Buffer::sync(const std::vector<int>& device_ids,
     } else if (rank == 0) {
       printf(
           "[mscclpp_ep] NVLS HT multicast: disabled (low_latency=%d, num_rdma_ranks=%d, "
-          "nvls_supported=%d)\n",
-          (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported());
+          "nvls_supported=%d, fabric_ipc=%d)\n",
+          (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(),
+          (int)fabric_ipc_supported);
       fflush(stdout);
     }
 
diff --git a/test/python/ext/ep/test_internode_multirank.py b/test/python/ext/ep/test_internode_multirank.py
index eddfb6be..b7ae2e8d 100644
--- a/test/python/ext/ep/test_internode_multirank.py
+++ b/test/python/ext/ep/test_internode_multirank.py
@@ -41,10 +41,26 @@ import torch
 import torch.distributed as dist
 
 
+def _detect_local_world_size():
+    """Number of GPUs per node (4 on GB200, 8 on H100/A100, etc.).
+
+    Resolution order:
+      1. `MSCCLPP_EP_LOCAL_WORLD_SIZE` env var (matches the C++ side).
+      2. `LOCAL_WORLD_SIZE` (torchrun) or `OMPI_COMM_WORLD_LOCAL_SIZE` (mpirun).
+      3. `torch.cuda.device_count()` on the current host.
+    """
+    for var in ("MSCCLPP_EP_LOCAL_WORLD_SIZE", "LOCAL_WORLD_SIZE", "OMPI_COMM_WORLD_LOCAL_SIZE"):
+        v = os.environ.get(var)
+        if v and int(v) > 0:
+            return int(v)
+    return max(1, torch.cuda.device_count())
+
+
 def init_dist():
     rank = int(os.environ["RANK"])
     world_size = int(os.environ["WORLD_SIZE"])
-    local_rank = int(os.environ.get("LOCAL_RANK", rank % 4))
+    local_world_size = _detect_local_world_size()
+    local_rank = int(os.environ.get("LOCAL_RANK", rank % local_world_size))
     torch.cuda.set_device(local_rank)
     dist.init_process_group(
         backend="nccl", world_size=world_size, rank=rank, device_id=torch.device(f"cuda:{local_rank}")
@@ -71,10 +87,10 @@ def main():
     rank, num_ranks, local_rank, group = init_dist()
     from mscclpp.ext import ep
 
-    NUM_MAX_NVL_PEERS = 4
+    NUM_MAX_NVL_PEERS = _detect_local_world_size()
     assert (
         num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS
-    ), f"expected >1 node with 8 GPUs each, got num_ranks={num_ranks}"
+    ), f"expected >1 node with {NUM_MAX_NVL_PEERS} GPUs each, got num_ranks={num_ranks}"
     num_nodes = num_ranks // NUM_MAX_NVL_PEERS
     num_local_ranks = NUM_MAX_NVL_PEERS
 

From 252a422030c092163b8c75577b7223ca264f2443 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 15 May 2026 11:50:43 -0700
Subject: [PATCH 10/12] Handle PortChannel flush asynchronously from the host
 proxy (#802)

When a `PortChannel` requests `flush()`, the host-side proxy was being
blocked, which may cause head-of-line blocking of other parallel
`PortChannel`s' requests. Now the proxy handles `flush()` requests
asynchronously. This feature especially helps performance when we need
multiple IB QPs and need to flush QPs.
---
 include/mscclpp/core.hpp                |   2 +
 include/mscclpp/fifo.hpp                |   5 +
 include/mscclpp/port_channel.hpp        |   4 +
 include/mscclpp/port_channel_device.hpp |  33 ++-
 include/mscclpp/proxy.hpp               |   4 +-
 src/core/connection.cc                  |  36 ++-
 src/core/fifo.cc                        |   2 +
 src/core/include/atomic.hpp             |  16 +-
 src/core/include/connection.hpp         |  25 ++
 src/core/include/proxy_impl.hpp         |  38 ++++
 src/core/port_channel.cc                |  64 +++++-
 src/core/proxy.cc                       |  34 +--
 test/mp_unit/mp_unit_tests.hpp          |   8 +
 test/mp_unit/port_channel_tests.cu      | 289 ++++++++++++++++++++++++
 14 files changed, 507 insertions(+), 53 deletions(-)
 create mode 100644 src/core/include/proxy_impl.hpp

diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp
index ca2fc34f..45b56bcc 100644
--- a/include/mscclpp/core.hpp
+++ b/include/mscclpp/core.hpp
@@ -676,6 +676,8 @@ class Connection {
   friend class Semaphore;
   friend class ProxyService;
   friend class BaseConnection;
+  friend struct BasePortChannel;
+  friend struct PortChannel;
 };
 
 /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user.
diff --git a/include/mscclpp/fifo.hpp b/include/mscclpp/fifo.hpp
index 2ee793ad..6aae03b5 100644
--- a/include/mscclpp/fifo.hpp
+++ b/include/mscclpp/fifo.hpp
@@ -29,6 +29,11 @@ class Fifo {
   /// Remove the head trigger.
   void pop();
 
+  /// Get the current tail position — the FIFO push-return value of the trigger about to be
+  /// (or currently being) processed by the proxy thread. Monotonically increasing.
+  /// @return The current tail position.
+  uint64_t tail() const;
+
   /// Get FIFO size.
   /// @return Number of entries in the FIFO.
   int size() const;
diff --git a/include/mscclpp/port_channel.hpp b/include/mscclpp/port_channel.hpp
index ed660407..18d67524 100644
--- a/include/mscclpp/port_channel.hpp
+++ b/include/mscclpp/port_channel.hpp
@@ -84,8 +84,12 @@ class ProxyService : public BaseProxyService {
   std::vector<RegisteredMemory> memories_;
   std::shared_ptr<Proxy> proxy_;
   std::unordered_map<std::shared_ptr<BaseConnection>, int> inflightRequests_;
+  // Latest pending TriggerSync FIFO position per connection. Proxy publishes pos+1 to the
+  // connection's gpuFlushDonePos_ when the CQ drains, then erases the entry.
+  std::unordered_map<std::shared_ptr<BaseConnection>, uint64_t> pendingFlushPos_;
 
   ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw);
+  void progressFlushes();
 };
 
 /// Port channel without specifying source/destination memory regions.
diff --git a/include/mscclpp/port_channel_device.hpp b/include/mscclpp/port_channel_device.hpp
index adff3fcd..74fa3d89 100644
--- a/include/mscclpp/port_channel_device.hpp
+++ b/include/mscclpp/port_channel_device.hpp
@@ -17,6 +17,19 @@ using SemaphoreId = uint32_t;
 /// actual.
 using MemoryId = uint32_t;
 
+namespace detail {
+#if defined(MSCCLPP_DEVICE_COMPILE)
+/// Wait until the proxy has processed and drained the TriggerSync at FIFO position `fifoPos`.
+/// The proxy publishes `flushDonePos = latestCompletedPos + 1` when the CQ drains, so the
+/// wait condition `flushDonePos > fifoPos` is satisfied exactly when our own request has
+/// been completed. Using the FIFO push position as the wait target couples the wait to the
+/// FIFO order, avoiding races when multiple GPU threads concurrently flush the same channel.
+MSCCLPP_DEVICE_INLINE void waitFlush(uint64_t* flushDonePos, uint64_t fifoPos, [[maybe_unused]] int64_t maxSpinCount) {
+  POLL_MAYBE_JAILBREAK((atomicLoad<uint64_t, scopeSystem>(flushDonePos, memoryOrderAcquire) <= fifoPos), maxSpinCount);
+}
+#endif  // defined(MSCCLPP_DEVICE_COMPILE)
+}  // namespace detail
+
 struct BasePortChannelDeviceHandle {
   SemaphoreId semaphoreId_;
 
@@ -26,12 +39,16 @@ struct BasePortChannelDeviceHandle {
   // can produce for and the sole proxy thread consumes it.
   FifoDeviceHandle fifo_;
 
+  // One past the highest FIFO position with a completed flush on this connection.
+  // Host-pinned: proxy writes after CQ drain, GPU reads in waitFlush().
+  uint64_t* flushDonePos_;
+
   MSCCLPP_INLINE BasePortChannelDeviceHandle() = default;
 
   MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle(SemaphoreId semaphoreId,
                                                          Host2DeviceSemaphoreDeviceHandle semaphore,
-                                                         FifoDeviceHandle fifo)
-      : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {}
+                                                         FifoDeviceHandle fifo, uint64_t* flushDonePos)
+      : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo), flushDonePos_(flushDonePos) {}
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
   /// Push a TriggerData to the FIFO.
@@ -86,9 +103,9 @@ struct BasePortChannelDeviceHandle {
   /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
   MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dstId, uint64_t dstOffset, MemoryId srcId,
                                                    uint64_t srcOffset, uint64_t size, int64_t maxSpinCount = 1000000) {
-    uint64_t curFifoHead =
+    uint64_t pos =
         fifo_.push({TriggerData | TriggerFlag | TriggerSync, dstId, dstOffset, srcId, srcOffset, size, semaphoreId_});
-    fifo_.sync(curFifoHead, maxSpinCount);
+    detail::waitFlush(flushDonePos_, pos, maxSpinCount);
   }
 
   /// Push a TriggerData, a TriggerFlag, and a TriggerSync at the same time to the FIFO.
@@ -105,8 +122,8 @@ struct BasePortChannelDeviceHandle {
   /// Push a TriggerSync to the FIFO.
   /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
   MSCCLPP_DEVICE_INLINE void flush(int64_t maxSpinCount = 1000000) {
-    uint64_t curFifoHead = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_});
-    fifo_.sync(curFifoHead, maxSpinCount);
+    uint64_t pos = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_});
+    detail::waitFlush(flushDonePos_, pos, maxSpinCount);
   }
 
   /// Check if the port channel has been signaled.
@@ -128,8 +145,8 @@ struct PortChannelDeviceHandle : public BasePortChannelDeviceHandle {
 
   MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(SemaphoreId semaphoreId,
                                                      Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
-                                                     MemoryId dst, MemoryId src)
-      : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {}
+                                                     MemoryId dst, MemoryId src, uint64_t* flushDonePos)
+      : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo, flushDonePos), dst_(dst), src_(src) {}
 
 #if defined(MSCCLPP_DEVICE_COMPILE)
   /// Push a TriggerData to the FIFO.
diff --git a/include/mscclpp/proxy.hpp b/include/mscclpp/proxy.hpp
index 990deabb..291206c0 100644
--- a/include/mscclpp/proxy.hpp
+++ b/include/mscclpp/proxy.hpp
@@ -20,8 +20,9 @@ enum class ProxyHandlerResult {
 };
 
 class Proxy;
+class ProxyService;
 
-/// Handler function type for proxy.
+/// Handler function type for proxy. Called once per ready FIFO trigger.
 using ProxyHandler = std::function<ProxyHandlerResult(ProxyTrigger)>;
 
 /// Host-side proxy for PortChannels.
@@ -54,6 +55,7 @@ class Proxy {
   std::shared_ptr<Fifo> fifo();
 
  private:
+  friend class ProxyService;
   struct Impl;
   std::unique_ptr<Impl> pimpl_;
 };
diff --git a/src/core/connection.cc b/src/core/connection.cc
index 11ecd968..e01c4a6e 100644
--- a/src/core/connection.cc
+++ b/src/core/connection.cc
@@ -7,13 +7,13 @@
 #include <mscclpp/npkit/npkit.hpp>
 #endif
 
-#include <mscclpp/atomic_device.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/utils.hpp>
 #include <sstream>
 #include <thread>
 
 #include "api.h"
+#include "atomic.hpp"
 #include "context.hpp"
 #include "endpoint.hpp"
 #include "gpu_utils_internal.hpp"
@@ -43,7 +43,10 @@ const RegisteredMemory::Impl& BaseConnection::getImpl(const RegisteredMemory& me
 Context::Impl& BaseConnection::getImpl(Context& context) { return *(context.pimpl_); }
 
 MSCCLPP_API_CPP BaseConnection::BaseConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint)
-    : context_(context), localEndpoint_(localEndpoint), maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()) {}
+    : context_(context),
+      localEndpoint_(localEndpoint),
+      maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()),
+      gpuFlushDonePos_(detail::gpuCallocHostShared<uint64_t>()) {}
 
 MSCCLPP_API_CPP std::shared_ptr<Context> BaseConnection::context() const { return context_; }
 
@@ -489,6 +492,35 @@ void IBConnection::flush(int64_t timeoutUsec) {
 #endif
 }
 
+void IBConnection::requestFlush() {
+  // No-op: IB sends were already posted by prior conn.write() calls in handleTrigger.
+  // progressFlush() drives completion by polling the send CQ.
+}
+
+bool IBConnection::progressFlush() {
+  if (recvThreadError_.load(std::memory_order_acquire)) {
+    THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
+  }
+
+  auto qp = qp_.lock();
+  if (!qp || qp->getNumSendCqItems() == 0) {
+    return true;  // QP expired or CQ already drained.
+  }
+
+  int wcNum = qp->pollSendCq();
+  if (wcNum < 0) {
+    THROW(NET, IbError, errno, "pollSendCq failed in progressFlush");
+  }
+  for (int i = 0; i < wcNum; ++i) {
+    int status = qp->getSendWcStatus(i);
+    if (status != static_cast<int>(WsStatus::Success)) {
+      THROW(NET, Error, ErrorCode::SystemError,
+            "an IB work item failed in progressFlush: ", qp->getSendWcStatusString(i));
+    }
+  }
+  return qp->getNumSendCqItems() == 0;
+}
+
 // EthernetConnection
 
 EthernetConnection::EthernetConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
diff --git a/src/core/fifo.cc b/src/core/fifo.cc
index e0ac9916..b11775d8 100644
--- a/src/core/fifo.cc
+++ b/src/core/fifo.cc
@@ -53,6 +53,8 @@ MSCCLPP_API_CPP void Fifo::pop() {
   atomicStore(pimpl_->tail.get(), curTail + 1, memoryOrderRelease);
 }
 
+MSCCLPP_API_CPP uint64_t Fifo::tail() const { return *(pimpl_->tail); }
+
 MSCCLPP_API_CPP int Fifo::size() const { return pimpl_->size; }
 
 MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() const {
diff --git a/src/core/include/atomic.hpp b/src/core/include/atomic.hpp
index 26f549f2..b6079162 100644
--- a/src/core/include/atomic.hpp
+++ b/src/core/include/atomic.hpp
@@ -4,18 +4,16 @@
 #ifndef MSCCLPP_ATOMIC_HPP_
 #define MSCCLPP_ATOMIC_HPP_
 
-#if defined(MSCCLPP_USE_CUDA)
-#ifndef MSCCLPP_DEVICE_CUDA
+// On CUDA host-side compiles, force atomic_device.hpp's CUDA branch so host code uses
+// cuda::atomic_ref (for system-scope ordering with GPU readers). On CUDA device compiles
+// (MSCCLPP_DEVICE_CUDA already set by device.hpp) and on ROCm builds, include normally —
+// atomic_device.hpp's branch selection works correctly without forcing.
+#if defined(MSCCLPP_USE_CUDA) && !defined(MSCCLPP_DEVICE_CUDA)
 #define MSCCLPP_DEVICE_CUDA
 #include <mscclpp/atomic_device.hpp>
 #undef MSCCLPP_DEVICE_CUDA
-#endif  // !defined(MSCCLPP_DEVICE_CUDA)
-#else   // !defined(MSCCLPP_USE_CUDA)
-#ifndef MSCCLPP_DEVICE_HIP
-#define MSCCLPP_DEVICE_HIP
+#else
 #include <mscclpp/atomic_device.hpp>
-#undef MSCCLPP_DEVICE_HIP
-#endif  // !defined(MSCCLPP_DEVICE_HIP)
-#endif  // !defined(MSCCLPP_USE_CUDA)
+#endif
 
 #endif  // MSCCLPP_ATOMIC_HPP_
\ No newline at end of file
diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp
index 22a9930f..eda4b3ef 100644
--- a/src/core/include/connection.hpp
+++ b/src/core/include/connection.hpp
@@ -50,6 +50,23 @@ class BaseConnection {
   /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
   virtual bool isSignalForwarding() const { return false; }
 
+  /// Request a flush. Subclasses that support async flush (e.g. IBConnection) override this
+  /// to be a no-op and rely on progressFlush() to drive completion via the proxy thread.
+  /// The default does a blocking flush(); progressFlush() then trivially returns true.
+  /// @note Only call from the proxy thread.
+  virtual void requestFlush() { flush(); }
+
+  /// Progress pending async flush operations (non-blocking CQ poll).
+  /// @note Only call from the proxy thread.
+  /// @return true if no flush is pending (CQ fully drained or no request).
+  virtual bool progressFlush() { return true; }
+
+  /// Get pointer to the GPU-visible flush-done position (host-pinned memory).
+  /// ProxyService writes "one past the highest completed FIFO position" here when the CQ
+  /// drains; GPU threads spin on it (`waitFlush`) until it surpasses their own push position.
+  /// @note Pointer is valid for the lifetime of this Connection.
+  uint64_t* getFlushDonePtr() const { return gpuFlushDonePos_.get(); }
+
   virtual Transport transport() const = 0;
 
   virtual Transport remoteTransport() const = 0;
@@ -75,6 +92,11 @@ class BaseConnection {
   std::shared_ptr<Context> context_;
   Endpoint localEndpoint_;
   int maxWriteQueueSize_;
+
+  // GPU-visible flush-done position (host-pinned memory). ProxyService writes one past the
+  // highest FIFO position whose TriggerSync request has fully completed on this connection
+  // (CQ drained for IB, synchronous flush() returned for non-IB).
+  std::shared_ptr<uint64_t> gpuFlushDonePos_;
 };
 
 class CudaIpcConnection : public BaseConnection {
@@ -149,6 +171,9 @@ class IBConnection : public BaseConnection {
   void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override;
 
   void flush(int64_t timeoutUsec) override;
+
+  void requestFlush() override;
+  bool progressFlush() override;
 };
 
 class EthernetConnection : public BaseConnection {
diff --git a/src/core/include/proxy_impl.hpp b/src/core/include/proxy_impl.hpp
new file mode 100644
index 00000000..a588e5df
--- /dev/null
+++ b/src/core/include/proxy_impl.hpp
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef MSCCLPP_PROXY_IMPL_HPP_
+#define MSCCLPP_PROXY_IMPL_HPP_
+
+#include <atomic>
+#include <functional>
+#include <memory>
+#include <mscclpp/fifo.hpp>
+#include <mscclpp/proxy.hpp>
+#include <thread>
+
+namespace mscclpp {
+
+struct Proxy::Impl {
+  ProxyHandler handler;
+  std::function<void()> threadInit;
+  std::function<void()> progressHandler;
+  std::shared_ptr<Fifo> fifo;
+  std::atomic_bool threadStarted;
+  std::thread service;
+  std::atomic_bool running;
+
+  Impl(ProxyHandler handler, std::function<void()> threadInit, int fifoSize)
+      : handler(handler),
+        threadInit(threadInit),
+        fifo(std::make_shared<Fifo>(fifoSize)),
+        threadStarted(false),
+        running(false) {}
+
+  // Must be called before start() — the proxy thread captures progressHandler at start time.
+  void setProgressHandler(std::function<void()> h) { progressHandler = std::move(h); }
+};
+
+}  // namespace mscclpp
+
+#endif  // MSCCLPP_PROXY_IMPL_HPP_
diff --git a/src/core/port_channel.cc b/src/core/port_channel.cc
index b8242db3..210b2eeb 100644
--- a/src/core/port_channel.cc
+++ b/src/core/port_channel.cc
@@ -1,11 +1,14 @@
 // Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
+// Licensed under the MIT License.
 
 #include <mscclpp/numa.hpp>
 #include <mscclpp/port_channel.hpp>
 
 #include "api.h"
-#include "debug.h"
+#include "atomic.hpp"
+#include "connection.hpp"
+#include "logger.hpp"
+#include "proxy_impl.hpp"
 
 namespace mscclpp {
 
@@ -34,11 +37,12 @@ MSCCLPP_API_CPP ProxyService::ProxyService(int fifoSize) {
     MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
     if (deviceNumaNode >= 0) {
       numaBind(deviceNumaNode);
-      INFO(MSCCLPP_INIT, "NUMA node of ProxyService proxy thread is set to %d", deviceNumaNode);
+      INFO(CONN, "NUMA node of ProxyService proxy thread is set to ", deviceNumaNode);
     }
   };
   auto handlerFunc = [&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); };
   proxy_ = std::make_shared<Proxy>(handlerFunc, initFunc, fifoSize);
+  proxy_->pimpl_->setProgressHandler([this]() { progressFlushes(); });
 }
 
 MSCCLPP_API_CPP SemaphoreId ProxyService::buildAndAddSemaphore(Communicator& communicator,
@@ -84,9 +88,28 @@ MSCCLPP_API_CPP PortChannel ProxyService::portChannel(SemaphoreId id, MemoryId d
 
 MSCCLPP_API_CPP void ProxyService::startProxy(bool blocking) { proxy_->start(blocking); }
 
-MSCCLPP_API_CPP void ProxyService::stopProxy() { proxy_->stop(); }
+MSCCLPP_API_CPP void ProxyService::stopProxy() {
+  proxy_->stop();
+  // Drain pending flushes. After a bounded loop, force-unblock any still-pending GPU
+  // waiters with a sentinel write (UINT64_MAX > any FIFO position).
+  for (int i = 0; i < 1000 && !pendingFlushPos_.empty(); ++i) {
+    progressFlushes();
+  }
+  if (!pendingFlushPos_.empty()) {
+    WARN(CONN, "stopProxy: ", pendingFlushPos_.size(), " connections still pending; writing sentinel");
+    for (auto& [conn, pos] : pendingFlushPos_) {
+      if (uint64_t* ptr = conn->getFlushDonePtr()) atomicStore(ptr, UINT64_MAX, memoryOrderRelease);
+    }
+    pendingFlushPos_.clear();
+  }
+}
 
 ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
+  // The proxy is the sole FIFO consumer and processes in strict push order, so the FIFO's
+  // tail (between poll() and pop()) matches the value GPU's fifo_.push() returned for this
+  // trigger — use it directly as our per-trigger sequence number.
+  uint64_t pos = proxy_->fifo()->tail();
+
   std::shared_ptr<Host2DeviceSemaphore> semaphore = semaphores_[trigger.fields.semaphoreId];
 
   auto& conn = semaphore->connection();
@@ -105,9 +128,15 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
     numRequests++;
   }
 
-  if (((trigger.fields.type & TriggerSync) && numRequests > 0) ||
-      (maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize)) {
-    conn.flush();
+  if (trigger.fields.type & TriggerSync) {
+    // Record this TriggerSync's FIFO position. The GPU caller is spinning on
+    // flushDonePos_ > pos; progressFlushes() will publish pos+1 once the CQ drains.
+    // Later TriggerSyncs on the same conn overwrite — CQ drain completes them all at once.
+    conn.impl_->requestFlush();
+    pendingFlushPos_[conn.impl_] = pos;
+    numRequests = 0;
+  } else if (maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize) {
+    conn.flush();  // flow-control flush stays blocking
     numRequests = 0;
   }
 
@@ -115,12 +144,27 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
 }
 
 MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const {
-  return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle());
+  auto& conn = semaphore_->connection();
+  return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(),
+                                       conn.impl_->getFlushDonePtr());
 }
 
 MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const {
-  return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_,
-                                   src_);
+  auto& conn = semaphore_->connection();
+  return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_, src_,
+                                   conn.impl_->getFlushDonePtr());
+}
+
+void ProxyService::progressFlushes() {
+  for (auto it = pendingFlushPos_.begin(); it != pendingFlushPos_.end();) {
+    if (it->first->progressFlush()) {
+      // CQ drained: publish pos+1 to unblock GPU waiters whose own pos <= recorded pos.
+      atomicStore(it->first->getFlushDonePtr(), it->second + 1, memoryOrderRelease);
+      it = pendingFlushPos_.erase(it);
+    } else {
+      ++it;
+    }
+  }
 }
 
 }  // namespace mscclpp
diff --git a/src/core/proxy.cc b/src/core/proxy.cc
index de5b90fc..554336e8 100644
--- a/src/core/proxy.cc
+++ b/src/core/proxy.cc
@@ -1,38 +1,21 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <atomic>
 #include <mscclpp/core.hpp>
 #include <mscclpp/gpu_utils.hpp>
 #include <mscclpp/numa.hpp>
 #include <mscclpp/proxy.hpp>
 #include <mscclpp/utils.hpp>
-#include <thread>
 
 #include "api.h"
-#include "debug.h"
+#include "logger.hpp"
+#include "proxy_impl.hpp"
 
 namespace mscclpp {
 
 constexpr int ProxyStopCheckPeriod = 1000;
 constexpr int ProxyStartWarnPeriod = 1000;
 
-struct Proxy::Impl {
-  ProxyHandler handler;
-  std::function<void()> threadInit;
-  std::shared_ptr<Fifo> fifo;
-  std::atomic_bool threadStarted;
-  std::thread service;
-  std::atomic_bool running;
-
-  Impl(ProxyHandler handler, std::function<void()> threadInit, int fifoSize)
-      : handler(handler),
-        threadInit(threadInit),
-        fifo(std::make_shared<Fifo>(fifoSize)),
-        threadStarted(false),
-        running(false) {}
-};
-
 MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize) {
   pimpl_ = std::make_unique<Impl>(handler, threadInit, fifoSize);
 }
@@ -70,18 +53,23 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) {
 
     pimpl_->threadStarted.store(true, std::memory_order_release);
 
-    ProxyHandler handler = this->pimpl_->handler;
-    auto fifo = this->pimpl_->fifo;
+    ProxyHandler handler = pimpl_->handler;
+    auto progressHandler = pimpl_->progressHandler;
+    auto fifo = pimpl_->fifo;
     ProxyTrigger trigger;
 
     int runCnt = ProxyStopCheckPeriod;
     for (;;) {
       if (runCnt-- == 0) {
         runCnt = ProxyStopCheckPeriod;
-        if (!this->pimpl_->running.load(std::memory_order_acquire)) {
+        if (!pimpl_->running.load(std::memory_order_acquire)) {
           break;
         }
       }
+      // Per-iteration system work (e.g. progressing pending async flushes),
+      // run regardless of whether a trigger is ready this iteration.
+      if (progressHandler) progressHandler();
+
       // Poll to see if we are ready to send anything
       trigger = fifo->poll();
       if (trigger.fst == 0 || trigger.snd == 0) {  // TODO: this check is a potential pitfall for custom triggers
@@ -107,7 +95,7 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) {
       count--;
       if (count == 0) {
         count = ProxyStartWarnPeriod;
-        WARN("Proxy thread startup taking longer than expected.");
+        WARN(CONN, "Proxy thread startup taking longer than expected.");
       }
     }
   }
diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp
index f4a26cf9..eb8b5485 100644
--- a/test/mp_unit/mp_unit_tests.hpp
+++ b/test/mp_unit/mp_unit_tests.hpp
@@ -160,6 +160,14 @@ class PortChannelOneToOneTest : public CommunicatorTestBase {
   void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default);
   void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default);
   void testBandwidth(PingPongTestParams params);
+  void setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase,
+                            std::vector<std::shared_ptr<int>>& sendBuffs,
+                            std::vector<mscclpp::RegisteredMemory>& localMems,
+                            std::vector<mscclpp::RegisteredMemory>& remoteMems,
+                            std::vector<mscclpp::PortChannel>& portChannels);
+  void testMultiQpBandwidth(IbMode ibMode, int numQps);
+  void testMultiQpFlushStress(IbMode ibMode, int numQps);
+  void testSameChanConcurrentFlush(IbMode ibMode);
 
   std::shared_ptr<mscclpp::ProxyService> proxyService;
 };
diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu
index 4b1b0cfb..47034cdb 100644
--- a/test/mp_unit/port_channel_tests.cu
+++ b/test/mp_unit/port_channel_tests.cu
@@ -625,3 +625,292 @@ PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostNoAtomicMode) {
   testBandwidth(PingPongTestParams{
       .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic});
 }
+
+static constexpr int kMaxQps = 4;
+__constant__ DeviceHandle<mscclpp::PortChannel> gMultiQpPortChans[kMaxQps];
+
+// Multi-QP bandwidth kernel: barrier on QP 0 only, then putWithSignal on all QPs.
+// Only one signal/wait pair is needed for sync between two GPU kernels.
+__global__ void kernelMultiQpBandwidth(int nElemPerChan, int nIters, int numQps) {
+  if (threadIdx.x != 0) return;
+  for (int i = 0; i < nIters; i++) {
+    // Barrier on QP 0 only — syncs both ranks
+    gMultiQpPortChans[0].signal();
+    gMultiQpPortChans[0].wait();
+    // Data transfer: put on all QPs simultaneously
+    for (int q = 0; q < numQps; q++) {
+      gMultiQpPortChans[q].putWithSignal(0, nElemPerChan * sizeof(int));
+    }
+    // Wait for all remote data arrivals
+    for (int q = 0; q < numQps; q++) {
+      gMultiQpPortChans[q].wait();
+    }
+  }
+}
+
+// Multi-QP setup helper: bootstrap N parallel IB connections + port channels in two
+// futures-based phases (issue all async ops before resolving any, to avoid deadlock).
+// tagBase: distinct base used by each caller so concurrent tests don't clash on tags.
+void PortChannelOneToOneTest::setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase,
+                                                   std::vector<std::shared_ptr<int>>& sendBuffs,
+                                                   std::vector<mscclpp::RegisteredMemory>& localMems,
+                                                   std::vector<mscclpp::RegisteredMemory>& remoteMems,
+                                                   std::vector<mscclpp::PortChannel>& portChannels) {
+  const int peer = 1 - communicator->bootstrap()->getRank();
+  sendBuffs.assign(numQps, nullptr);
+  localMems.assign(numQps, mscclpp::RegisteredMemory{});
+  remoteMems.assign(numQps, mscclpp::RegisteredMemory{});
+  portChannels.clear();
+
+  std::vector<std::shared_future<mscclpp::Connection>> connFutures(numQps);
+  std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemFutures(numQps);
+
+  for (int q = 0; q < numQps; q++) {
+    sendBuffs[q] = mscclpp::GpuBuffer<int>(elemsPerChan).memory();
+    localMems[q] = communicator->registerMemory(sendBuffs[q].get(), elemsPerChan * sizeof(int), ibTransport);
+
+    mscclpp::EndpointConfig cfg;
+    cfg.transport = ibTransport;
+    cfg.ib.gidIndex = std::stoi(gEnv->args["ib_gid_index"]);
+    cfg.ib.mode = ibMode;
+
+    connFutures[q] = communicator->connect(cfg, peer, tagBase + q);
+    communicator->sendMemory(localMems[q], peer, tagBase + numQps + q);
+    remoteMemFutures[q] = communicator->recvMemory(peer, tagBase + numQps + q);
+  }
+
+  for (int q = 0; q < numQps; q++) {
+    auto conn = connFutures[q].get();
+    remoteMems[q] = remoteMemFutures[q].get();
+    auto sema = communicator->buildSemaphore(conn, peer, tagBase + 2 * numQps + q).get();
+    mscclpp::SemaphoreId cid = proxyService->addSemaphore(sema);
+    portChannels.emplace_back(
+        proxyService->portChannel(cid, proxyService->addMemory(remoteMems[q]), proxyService->addMemory(localMems[q])));
+  }
+}
+
+void PortChannelOneToOneTest::testMultiQpBandwidth(IbMode ibMode, int numQps) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  const int rank = communicator->bootstrap()->getRank();
+  const int maxElemPerChan = 32 * 1024 * 1024;  // 128 MB per channel
+
+  std::vector<std::shared_ptr<int>> sendBuffs;
+  std::vector<mscclpp::RegisteredMemory> localMems;
+  std::vector<mscclpp::RegisteredMemory> remoteMems;
+  std::vector<mscclpp::PortChannel> portChannels;
+  setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/100, sendBuffs, localMems, remoteMems, portChannels);
+
+  std::vector<DeviceHandle<mscclpp::PortChannel>> handles;
+  for (auto& ch : portChannels) handles.push_back(ch.deviceHandle());
+  ASSERT_EQ(handles.size(), static_cast<size_t>(numQps));
+  ASSERT_LE(numQps, kMaxQps);  // numQps must not exceed __constant__ array size (kMaxQps)
+  MSCCLPP_CUDATHROW(
+      cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle<mscclpp::PortChannel>)));
+
+  proxyService->startProxy();
+
+  const std::string testName = ::mscclpp::test::currentTestName();
+  const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : "");
+
+  for (int nElemPerChan :
+       {256, 16 * 1024, 256 * 1024, 1024 * 1024, 4 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024}) {
+    int nIters = 10000;
+    // Warm-up
+    kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, 10, numQps);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    communicator->bootstrap()->barrier();
+
+    // Measure
+    mscclpp::Timer timer;
+    kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, nIters, numQps);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    double elapsedUs = timer.elapsed();
+    communicator->bootstrap()->barrier();
+
+    if (rank == 0) {
+      double totalBytes = (double)nElemPerChan * sizeof(int) * numQps;
+      double elapsedMsPerIter = elapsedUs / 1e3 / nIters;
+      double gbps = totalBytes / elapsedMsPerIter * 1e-6;
+      double totalSizeKB = totalBytes / 1024.0;
+      std::string label;
+      if (totalSizeKB >= 1024.0)
+        label = std::to_string((int)(totalSizeKB / 1024.0)) + " MB";
+      else
+        label = std::to_string((int)totalSizeKB) + " KB";
+      ::mscclpp::test::reportPerfResult(label + " (" + qpLabel + ")", gbps, "GB/s");
+    }
+  }
+
+  proxyService->stopProxy();
+
+  for (auto& m : localMems) registeredMemories.push_back(m);
+  for (auto& m : remoteMems) registeredMemories.push_back(m);
+}
+
+PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
+  for (int numQps : {1, 2, 4}) {
+    testMultiQpBandwidth(IbMode::Host, numQps);
+  }
+}
+
+PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
+  for (int numQps : {1, 2, 4}) {
+    testMultiQpBandwidth(IbMode::HostNoAtomic, numQps);
+  }
+}
+
+// Multi-QP flush-stress kernel: one thread per QP, all calling putWithSignalAndFlush
+// concurrently so all N CQ drains are in flight on the proxy thread at once.
+// This is the concurrent-flush worst case the async-progress design protects against.
+__global__ void kernelMultiQpFlushStress(int nElemPerChan, int nIters, int numQps) {
+  int q = threadIdx.x;
+  if (q >= numQps) return;
+  for (int i = 0; i < nIters; i++) {
+    if (q == 0) {
+      gMultiQpPortChans[0].signal();
+      gMultiQpPortChans[0].wait();
+    }
+    __syncthreads();
+    gMultiQpPortChans[q].putWithSignalAndFlush(0, nElemPerChan * sizeof(int));
+    gMultiQpPortChans[q].wait();
+    __syncthreads();
+  }
+}
+
+void PortChannelOneToOneTest::testMultiQpFlushStress(IbMode ibMode, int numQps) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  const int rank = communicator->bootstrap()->getRank();
+  const int maxElemPerChan = 64 * 1024;
+
+  std::vector<std::shared_ptr<int>> sendBuffs;
+  std::vector<mscclpp::RegisteredMemory> localMems;
+  std::vector<mscclpp::RegisteredMemory> remoteMems;
+  std::vector<mscclpp::PortChannel> portChannels;
+  setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/400, sendBuffs, localMems, remoteMems, portChannels);
+
+  std::vector<DeviceHandle<mscclpp::PortChannel>> handles;
+  for (auto& ch : portChannels) handles.push_back(ch.deviceHandle());
+  ASSERT_EQ(handles.size(), static_cast<size_t>(numQps));
+  ASSERT_LE(numQps, kMaxQps);
+  MSCCLPP_CUDATHROW(
+      cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle<mscclpp::PortChannel>)));
+
+  proxyService->startProxy();
+
+  const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : "");
+
+  for (int nElemPerChan : {256, 4 * 1024, 64 * 1024}) {
+    int nIters = 2000;
+    kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, 10, numQps);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    communicator->bootstrap()->barrier();
+
+    mscclpp::Timer timer;
+    kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, nIters, numQps);
+    MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+    double elapsedUs = timer.elapsed();
+    communicator->bootstrap()->barrier();
+
+    if (rank == 0) {
+      double usPerIter = elapsedUs / nIters;
+      double usPerIterPerQp = usPerIter / numQps;
+      int bytesPerChan = nElemPerChan * (int)sizeof(int);
+      std::string sizeLabel = (bytesPerChan >= 1024) ? (std::to_string(bytesPerChan / 1024) + " KB")
+                                                     : (std::to_string(bytesPerChan) + " B");
+      ::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter", usPerIter, "us");
+      ::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter/QP", usPerIterPerQp, "us");
+    }
+  }
+
+  proxyService->stopProxy();
+
+  for (auto& m : localMems) registeredMemories.push_back(m);
+  for (auto& m : remoteMems) registeredMemories.push_back(m);
+}
+
+PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
+  for (int numQps : {1, 2, 4}) {
+    testMultiQpFlushStress(IbMode::Host, numQps);
+  }
+}
+
+PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostNoAtomicMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
+  for (int numQps : {1, 2, 4}) {
+    testMultiQpFlushStress(IbMode::HostNoAtomic, numQps);
+  }
+}
+
+// Same-channel concurrent-flush kernel: N GPU threads on the same PortChannel each call
+// putWithSignalAndFlush in lockstep. Stresses the FIFO-position-based wait target so that
+// each caller waits on its own TriggerSync rather than on a globally-incrementing counter
+// that could be assigned out-of-order relative to the FIFO push order.
+__constant__ DeviceHandle<mscclpp::PortChannel> gSingleChanForConcurrentFlush;
+
+__global__ void kernelSameChanConcurrentFlush(int nIters) {
+  auto& chan = gSingleChanForConcurrentFlush;
+  int tid = threadIdx.x;
+  for (int i = 0; i < nIters; i++) {
+    // Each thread writes to a distinct slot (so puts don't overlap on remote side),
+    // then concurrently flushes on the same channel.
+    uint64_t offset = tid * sizeof(int);
+    chan.putWithSignalAndFlush(offset, offset, sizeof(int));
+    // Each thread waits for one signal from the remote rank's symmetric putWithSignalAndFlush.
+    chan.wait();
+  }
+}
+
+void PortChannelOneToOneTest::testSameChanConcurrentFlush(IbMode ibMode) {
+  if (gEnv->rank >= numRanksToUse) return;
+
+  constexpr int nThreads = 4;
+  std::vector<std::shared_ptr<int>> sendBuffs;
+  std::vector<mscclpp::RegisteredMemory> localMems;
+  std::vector<mscclpp::RegisteredMemory> remoteMems;
+  std::vector<mscclpp::PortChannel> portChannels;
+  setupMultiQpChannels(/*numQps=*/1, /*elemsPerChan=*/nThreads, ibMode, /*tagBase=*/700, sendBuffs, localMems,
+                       remoteMems, portChannels);
+
+  DeviceHandle<mscclpp::PortChannel> handle = portChannels[0].deviceHandle();
+  MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gSingleChanForConcurrentFlush, &handle, sizeof(handle)));
+
+  proxyService->startProxy();
+
+  // Warm-up
+  kernelSameChanConcurrentFlush<<<1, nThreads>>>(10);
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  communicator->bootstrap()->barrier();
+
+  // Measure: a successful completion (no deadlock, no CQ error) validates that each
+  // concurrent-flush caller waited on its own TriggerSync (not someone else's earlier one).
+  const int nIters = 500;
+  mscclpp::Timer timer;
+  kernelSameChanConcurrentFlush<<<1, nThreads>>>(nIters);
+  MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
+  double elapsedUs = timer.elapsed();
+  communicator->bootstrap()->barrier();
+
+  if (communicator->bootstrap()->getRank() == 0) {
+    double usPerIter = elapsedUs / nIters;
+    ::mscclpp::test::reportPerfResult(std::to_string(nThreads) + " threads same-chan per-iter", usPerIter, "us");
+  }
+
+  proxyService->stopProxy();
+  for (auto& m : localMems) registeredMemories.push_back(m);
+  for (auto& m : remoteMems) registeredMemories.push_back(m);
+}
+
+TEST(PortChannelOneToOneTest, SameChanConcurrentFlushIbHostMode) {
+  REQUIRE_IBVERBS;
+  REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
+  testSameChanConcurrentFlush(IbMode::Host);
+}

From 60a6d7219f1d97c32c275864ed3fb456b6b3f300 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Fri, 15 May 2026 14:06:50 -0700
Subject: [PATCH 11/12] Clean up completed communicator receives (#804)

## Summary
- Release the reference after last requests are ready.
- Keep ordered receive chaining for repeated rank/tag operations while
cleaning up completed receive bookkeeping.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/core/communicator.cc          | 102 +++++++++++++++++-------------
 src/core/include/communicator.hpp |   5 +-
 2 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/src/core/communicator.cc b/src/core/communicator.cc
index c95ca421..41e46bc5 100644
--- a/src/core/communicator.cc
+++ b/src/core/communicator.cc
@@ -7,6 +7,36 @@
 
 namespace mscclpp {
 
+namespace {
+
+template <typename T, typename Impl, typename Func>
+std::shared_future<T> makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Func func) {
+  // Weak placeholder to avoid a reference cycle; updated with the real recvItem after the future is created.
+  auto thisRecvItem = std::make_shared<std::weak_ptr<BaseRecvItem>>();
+  auto future = std::async(std::launch::deferred,
+                           [impl, remoteRank, tag, thisRecvItem, lastRecvItem = impl->getLastRecvItem(remoteRank, tag),
+                            func = std::move(func)]() mutable {
+                             auto cleanup = [impl, remoteRank, tag, thisRecvItem]() {
+                               impl->clearLastRecvItemIfMatches(remoteRank, tag, thisRecvItem->lock());
+                             };
+
+                             if (lastRecvItem) {
+                               // Recursive call to the previous receive items
+                               lastRecvItem->wait();
+                             }
+                             auto result = func();
+                             cleanup();
+                             return result;
+                           });
+  auto sharedFuture = std::shared_future<T>(std::move(future));
+  auto recvItem = std::make_shared<RecvItem<T>>(sharedFuture);
+  *thisRecvItem = recvItem;
+  impl->setLastRecvItem(remoteRank, tag, recvItem);
+  return sharedFuture;
+}
+
+}  // namespace
+
 Communicator::Impl::Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
     : bootstrap_(bootstrap) {
   if (!context) {
@@ -32,6 +62,14 @@ std::shared_ptr<BaseRecvItem> Communicator::Impl::getLastRecvItem(int remoteRank
   return it->second;
 }
 
+void Communicator::Impl::clearLastRecvItemIfMatches(int remoteRank, int tag,
+                                                    const std::shared_ptr<BaseRecvItem>& expectedItem) {
+  auto it = lastRecvItems_.find({remoteRank, tag});
+  if (it != lastRecvItems_.end() && it->second == expectedItem) {
+    lastRecvItems_.erase(it);
+  }
+}
+
 MSCCLPP_API_CPP Communicator::~Communicator() = default;
 
 MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
@@ -83,19 +121,11 @@ MSCCLPP_API_CPP std::shared_future<RegisteredMemory> Communicator::recvMemory(in
     locRecvMemList.push_back(std::move(locRecvMem));
     return future;
   }
-  auto future = std::async(std::launch::deferred,
-                           [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() {
-                             if (lastRecvItem) {
-                               // Recursive call to the previous receive items
-                               lastRecvItem->wait();
-                             }
-                             std::vector<char> data;
-                             bootstrap()->recv(data, remoteRank, tag);
-                             return RegisteredMemory::deserialize(data);
-                           });
-  auto shared_future = std::shared_future<RegisteredMemory>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<RegisteredMemory>>(shared_future));
-  return shared_future;
+  return makeOrderedRecvFuture<RegisteredMemory>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag]() {
+    std::vector<char> data;
+    bootstrap()->recv(data, remoteRank, tag);
+    return RegisteredMemory::deserialize(data);
+  });
 }
 
 MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpoint& localEndpoint, int remoteRank,
@@ -112,22 +142,15 @@ MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpo
 
   bootstrap()->send(localEndpoint.serialize(), remoteRank, tag);
 
-  auto future = std::async(std::launch::deferred, [this, remoteRank, tag, localEndpoint,
-                                                   lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() mutable {
-    if (lastRecvItem) {
-      // Recursive call to the previous receive items
-      lastRecvItem->wait();
-    }
-    std::vector<char> data;
-    bootstrap()->recv(data, remoteRank, tag);
-    auto remoteEndpoint = Endpoint::deserialize(data);
-    auto connection = context()->connect(localEndpoint, remoteEndpoint);
-    pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
-    return connection;
-  });
-  auto shared_future = std::shared_future<Connection>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Connection>>(shared_future));
-  return shared_future;
+  return makeOrderedRecvFuture<Connection>(pimpl_.get(), remoteRank, tag,
+                                           [this, remoteRank, tag, localEndpoint]() mutable {
+                                             std::vector<char> data;
+                                             bootstrap()->recv(data, remoteRank, tag);
+                                             auto remoteEndpoint = Endpoint::deserialize(data);
+                                             auto connection = context()->connect(localEndpoint, remoteEndpoint);
+                                             pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
+                                             return connection;
+                                           });
 }
 
 MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const EndpointConfig& localConfig, int remoteRank,
@@ -141,21 +164,12 @@ MSCCLPP_API_CPP std::shared_future<Semaphore> Communicator::buildSemaphore(const
   SemaphoreStub localStub(connection);
   bootstrap()->send(localStub.serialize(), remoteRank, tag);
 
-  auto future =
-      std::async(std::launch::deferred, [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag),
-                                         localStub = localStub]() mutable {
-        if (lastRecvItem) {
-          // Recursive call to the previous receive items
-          lastRecvItem->wait();
-        }
-        std::vector<char> data;
-        bootstrap()->recv(data, remoteRank, tag);
-        auto remoteStub = SemaphoreStub::deserialize(data);
-        return Semaphore(localStub, remoteStub);
-      });
-  auto shared_future = std::shared_future<Semaphore>(std::move(future));
-  pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Semaphore>>(shared_future));
-  return shared_future;
+  return makeOrderedRecvFuture<Semaphore>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localStub]() mutable {
+    std::vector<char> data;
+    bootstrap()->recv(data, remoteRank, tag);
+    auto remoteStub = SemaphoreStub::deserialize(data);
+    return Semaphore(localStub, remoteStub);
+  });
 }
 
 MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) {
diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp
index 8d7539ef..f15e20f7 100644
--- a/src/core/include/communicator.hpp
+++ b/src/core/include/communicator.hpp
@@ -62,7 +62,7 @@ struct Communicator::Impl {
   std::unordered_map<const BaseConnection*, ConnectionInfo> connectionInfos_;
 
   // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
-  // If the RecvItem gets ready, it will be removed at the next call to getLastRecvItem.
+  // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready.
   std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;
 
   // RegisteredMemory items sent to the local rank of each tag. Sending memory to the local rank is
@@ -79,6 +79,9 @@ struct Communicator::Impl {
   // If the item is ready, it will be removed from the map and nullptr will be returned.
   std::shared_ptr<BaseRecvItem> getLastRecvItem(int remoteRank, int tag);
 
+  // Clear the last RecvItem only if it still matches the expected item.
+  void clearLastRecvItemIfMatches(int remoteRank, int tag, const std::shared_ptr<BaseRecvItem>& expectedItem);
+
   struct Connector;
 };
 

From 20bd1ec55b9cb12c03db05b2fad8df92fe0c1fc1 Mon Sep 17 00:00:00 2001
From: Qinghua Zhou <qinghuazhou@microsoft.com>
Date: Mon, 18 May 2026 21:44:20 +0000
Subject: [PATCH 12/12] ext/ep: fix CUDA 11.8 build + apply clang-format/black

- src/core/atomicadd_kernel.cu: restore the legacy 3-arg
  cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice) in the
  '#else' branch of the CUDA_VERSION >= 12050 guard. A prior
  edit had corrupted it to 'cuCtxCreate(&proxyAtomicCtx_vice)',
  which broke the CUDA 11.8 build (CodeQL CUDA cuda11.8 and
  MSCCLPPLang cuda11.8 jobs).

- Apply clang-format to src/ext/ep/* (no logic changes,
  fixes the cpplint CI job).

- Apply black to test/python/ext/ep/test_internode_multirank.py
  and test_intranode_multirank.py (no logic changes, fixes
  the pylint CI job).
---
 src/core/atomicadd_kernel.cu                  |   8 +-
 src/ext/ep/buffer.cc                          | 104 +++++----
 src/ext/ep/buffer.hpp                         |   4 +-
 src/ext/ep/kernels/api.cuh                    |  21 +-
 src/ext/ep/kernels/buffer.cuh                 |   3 +-
 src/ext/ep/kernels/internode.cu               | 210 ++++++++----------
 src/ext/ep/kernels/internode_ll.cu            |  18 +-
 src/ext/ep/kernels/utils.cuh                  |   2 +-
 .../python/ext/ep/test_internode_multirank.py |   9 +-
 .../python/ext/ep/test_intranode_multirank.py |  12 +-
 10 files changed, 177 insertions(+), 214 deletions(-)

diff --git a/src/core/atomicadd_kernel.cu b/src/core/atomicadd_kernel.cu
index 21427df6..f4ae3b02 100644
--- a/src/core/atomicadd_kernel.cu
+++ b/src/core/atomicadd_kernel.cu
@@ -37,13 +37,13 @@ void CudaIpcStream::atomicAdd(uint64_t* dst, int64_t value) {
     CUresult res = cuDeviceGet(&cuDevice, deviceId_);
     if (res != CUDA_SUCCESS) throw Error("cuDeviceGet failed", ErrorCode::InternalError);
 
-    // cuCtxCreate added a `paramsArray` argument in CUDA 12.5 — use the
-    // 4-arg form on new toolkits, fall back to the legacy 3-arg form on
-    // CUDA < 12.5 so we keep compiling against older drivers/toolkits.
+      // cuCtxCreate added a `paramsArray` argument in CUDA 12.5 — use the
+      // 4-arg form on new toolkits, fall back to the legacy 3-arg form on
+      // CUDA < 12.5 so we keep compiling against older drivers/toolkits.
 #if CUDA_VERSION >= 12050
     res = cuCtxCreate_v4(&proxyAtomicCtx_, NULL, 0, cuDevice);
 #else
-    res = cuCtxCreate(&proxyAtomicCtx_vice);
+    res = cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice);
 #endif
     if (res != CUDA_SUCCESS) throw Error("cuCtxCreate failed", ErrorCode::InternalError);
 
diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc
index 02b38c36..f6e5870b 100644
--- a/src/ext/ep/buffer.cc
+++ b/src/ext/ep/buffer.cc
@@ -154,8 +154,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
     if (v > 0 && v <= NUM_MAX_NVL_PEERS) local_world_size = v;
   }
   rdma_rank = rank / local_world_size, nvl_rank = rank % local_world_size;
-  num_rdma_ranks = std::max(1, num_ranks / local_world_size),
-  num_nvl_ranks = std::min(num_ranks, local_world_size);
+  num_rdma_ranks = std::max(1, num_ranks / local_world_size), num_nvl_ranks = std::min(num_ranks, local_world_size);
 
   // Get device info
   cudaDeviceProp device_prop = {};
@@ -404,8 +403,7 @@ void Buffer::sync(const std::vector<int>& device_ids,
     // worker ranks cannot reach.
     static const bool fabric_ipc_supported = resolve_fabric_ipc_supported();
     const bool use_fabric_ipc_alloc =
-        mscclpp::isNvlsSupported() && fabric_ipc_supported &&
-        (low_latency_mode || num_rdma_ranks > 1);
+        mscclpp::isNvlsSupported() && fabric_ipc_supported && (low_latency_mode || num_rdma_ranks > 1);
     if (use_fabric_ipc_alloc) {
       rdma_buffer_ptr = mscclpp::detail::gpuCallocPhysical(num_rdma_bytes);
       CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes));
@@ -415,8 +413,8 @@ void Buffer::sync(const std::vector<int>& device_ids,
     }
     if (rank == 0) {
       printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d, fabric_ipc=%d)\n",
-             use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc",
-             (int)low_latency_mode, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported);
+             use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc", (int)low_latency_mode,
+             (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported);
       fflush(stdout);
     }
     bootstrap->barrier();
@@ -584,10 +582,8 @@ void Buffer::sync(const std::vector<int>& device_ids,
       // one slot per global rank (worst-case num_ranks = NUM_MAX_RDMA_PEERS *
       // NUM_MAX_NVL_PEERS). Each rank writes its own slot via `multimem.st`;
       // every receiver then reads the sub-position destined to it.
-      const size_t kPerSenderSlotBytes =
-          static_cast<size_t>(NUM_MAX_RDMA_PEERS) * kNvlsPerPeerBytes;
-      const size_t kMaxRanks =
-          static_cast<size_t>(NUM_MAX_RDMA_PEERS) * NUM_MAX_NVL_PEERS;
+      const size_t kPerSenderSlotBytes = static_cast<size_t>(NUM_MAX_RDMA_PEERS) * kNvlsPerPeerBytes;
+      const size_t kMaxRanks = static_cast<size_t>(NUM_MAX_RDMA_PEERS) * NUM_MAX_NVL_PEERS;
       const size_t data_bytes = kMaxRanks * kPerSenderSlotBytes;
 
       // 256 B alignment for each sub-region to keep `multimem` ops well-aligned.
@@ -612,8 +608,8 @@ void Buffer::sync(const std::vector<int>& device_ids,
       // propagates — there is no clean fallback mid-collective. The
       // `isNvlsSupported()` gate above is the production guard.
       nvls_ht_conn = mscclpp::connectNvlsCollective(communicator, all_ranks, nvls_ht_buffer->bytes());
-      auto sw = nvls_ht_conn->bindAllocatedMemory(
-          reinterpret_cast<CUdeviceptr>(nvls_ht_buffer->data()), nvls_ht_buffer->bytes());
+      auto sw = nvls_ht_conn->bindAllocatedMemory(reinterpret_cast<CUdeviceptr>(nvls_ht_buffer->data()),
+                                                  nvls_ht_buffer->bytes());
       nvls_ht_sc = std::make_shared<mscclpp::SwitchChannel>(std::move(sw));
       auto h = nvls_ht_sc->deviceHandle();
       nvls_ht_mc_ptr = h.mcPtr;
@@ -623,10 +619,9 @@ void Buffer::sync(const std::vector<int>& device_ids,
       // DIAG: print mcPtr/devicePtr/buf-VA per rank to verify whether
       // connectNvlsCollective produced a multicast that actually spans
       // both nodes (suspected: per-node only on Azure GB200).
-      printf(
-          "[mscclpp_ep] NVLS HT diag rank=%d mcPtr=%p devicePtr=%p bufVA=%p bytes=%zu\n",
-          rank, (void*)nvls_ht_mc_ptr, (void*)nvls_ht_dev_ptr,
-          (void*)nvls_ht_buffer->data(), (size_t)nvls_ht_buffer->bytes());
+      printf("[mscclpp_ep] NVLS HT diag rank=%d mcPtr=%p devicePtr=%p bufVA=%p bytes=%zu\n", rank,
+             (void*)nvls_ht_mc_ptr, (void*)nvls_ht_dev_ptr, (void*)nvls_ht_buffer->data(),
+             (size_t)nvls_ht_buffer->bytes());
       fflush(stdout);
 
       bootstrap->barrier();
@@ -635,16 +630,15 @@ void Buffer::sync(const std::vector<int>& device_ids,
         printf(
             "[mscclpp_ep] NVLS HT multicast: enabled=%d total=%zu KB "
             "(tail@%zu head@%zu barrier@%zu data@%zu)\n",
-            (int)nvls_ht_enabled, nvls_ht_total_bytes / 1024, nvls_ht_off_tail, nvls_ht_off_head,
-            nvls_ht_off_barrier, nvls_ht_off_data);
+            (int)nvls_ht_enabled, nvls_ht_total_bytes / 1024, nvls_ht_off_tail, nvls_ht_off_head, nvls_ht_off_barrier,
+            nvls_ht_off_data);
         fflush(stdout);
       }
     } else if (rank == 0) {
       printf(
           "[mscclpp_ep] NVLS HT multicast: disabled (low_latency=%d, num_rdma_ranks=%d, "
           "nvls_supported=%d, fabric_ipc=%d)\n",
-          (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(),
-          (int)fabric_ipc_supported);
+          (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported);
       fflush(stdout);
     }
 
@@ -717,8 +711,8 @@ void Buffer::sync(const std::vector<int>& device_ids,
         peer_rdma_bases[r] = remote_mems[r].data();
       }
       CUDA_CHECK(cudaMalloc(&peer_rdma_bases_gpu, sizeof(void*) * num_ranks));
-      CUDA_CHECK(cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases.data(),
-                            sizeof(void*) * num_ranks, cudaMemcpyHostToDevice));
+      CUDA_CHECK(
+          cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases.data(), sizeof(void*) * num_ranks, cudaMemcpyHostToDevice));
       if (rank == 0) {
         printf("[mscclpp_ep] Phase 4 fabric-IPC peer bases (rank 0):\n");
         for (int r = 0; r < num_ranks; ++r) {
@@ -1301,10 +1295,8 @@ Buffer::internode_dispatch(
                              buffer_ptrs_gpu, config.num_max_nvl_chunked_recv_tokens, task_fifo_ptrs_gpu, head, rank,
                              comm_stream, config.get_rdma_buffer_size_hint(hidden_int4 * sizeof(int4), num_ranks),
                              num_nvl_bytes, true, low_latency_mode, port_channel_handles_device_ptr.get(),
-                             memory_channel_handles_device_ptr.get(),
-                             nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr,
-                             nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr,
-                             nvls_ht_off_barrier, nvls_ht_cached_epoch);
+                             memory_channel_handles_device_ptr.get(), nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr,
+                             nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, nvls_ht_off_barrier, nvls_ht_cached_epoch);
     move_fifo_slots(2);
   } else {
     rdma_channel_prefix_matrix =
@@ -1329,9 +1321,8 @@ Buffer::internode_dispatch(
         buffer_ptrs_gpu, config.num_max_nvl_chunked_recv_tokens, task_fifo_ptrs_gpu, head, rank, comm_stream,
         config.get_rdma_buffer_size_hint(hidden_int4 * sizeof(int4), num_ranks), num_nvl_bytes, low_latency_mode,
         port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(),
-        nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr,
-        nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr,
-        nvls_ht_off_barrier, nvls_ht_off_data, nvls_ht_epoch, kNvlsPerPeerBytes);
+        nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, nvls_ht_off_barrier,
+        nvls_ht_off_data, nvls_ht_epoch, kNvlsPerPeerBytes);
     move_fifo_slots(3);
 
     // Synchronize total received tokens and tokens per expert
@@ -1396,10 +1387,14 @@ Buffer::internode_dispatch(
   // Phase 3: pass NVLS counter region pointers (head/tail × mc/dev). When
   // `nvls_ht_enabled` is false, all four are nullptr and the kernel falls
   // back to the legacy PortChannel/atomicAdd path.
-  void* nvls_head_mc = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr;
-  void* nvls_head_dev = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr;
-  void* nvls_tail_mc = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr;
-  void* nvls_tail_dev = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr;
+  void* nvls_head_mc =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr;
+  void* nvls_head_dev =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr;
+  void* nvls_tail_mc =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr;
+  void* nvls_tail_dev =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr;
   internode::dispatch(recv_x.data_ptr(), recv_x_scales_ptr, recv_topk_idx_ptr, recv_topk_weights_ptr,
                       cached_mode ? nullptr : recv_src_meta->data_ptr(), x.data_ptr(), x_scales_ptr, topk_idx_ptr,
                       topk_weights_ptr, cached_mode ? nullptr : send_rdma_head->data_ptr<int>(),
@@ -1412,9 +1407,8 @@ Buffer::internode_dispatch(
                       rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens, config.num_max_rdma_chunked_recv_tokens,
                       buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens, config.num_max_nvl_chunked_recv_tokens,
                       rank, num_ranks, cached_mode, comm_stream, num_channels, low_latency_mode,
-                      port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(),
-                      nvls_head_mc, nvls_head_dev, nvls_tail_mc, nvls_tail_dev,
-                      peer_rdma_bases_gpu);
+                      port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), nvls_head_mc,
+                      nvls_head_dev, nvls_tail_mc, nvls_tail_dev, peer_rdma_bases_gpu);
 
   // Wait streams
   std::optional<EventHandle> event;
@@ -1531,28 +1525,30 @@ std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandl
       config.num_max_nvl_chunked_recv_tokens, task_fifo_ptrs_gpu, head, rank, comm_stream,
       config.get_rdma_buffer_size_hint(hidden_int4 * sizeof(int4), num_ranks), num_nvl_bytes, false, low_latency_mode,
       port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(),
-      nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr,
-      nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr,
-      nvls_ht_off_barrier, (nvls_ht_enabled ? ++nvls_ht_cached_epoch : 0));
+      nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, nvls_ht_off_barrier,
+      (nvls_ht_enabled ? ++nvls_ht_cached_epoch : 0));
   move_fifo_slots(2);
 
   auto combined_x = torch::empty({num_combined_tokens, hidden}, x.options());
   // Phase 3: NVLS counter region pointers for combine kernel.
-  void* combine_nvls_head_mc = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr;
-  void* combine_nvls_head_dev = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr;
-  void* combine_nvls_tail_mc = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr;
-  void* combine_nvls_tail_dev = nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr;
-  internode::combine(at::cuda::ScalarTypeToCudaDataType(x.scalar_type()), combined_x.data_ptr(),
-                     combined_topk_weights_ptr, is_combined_token_in_rank.data_ptr<bool>(), x.data_ptr(),
-                     topk_weights_ptr, combined_rdma_head.data_ptr<int>(), combined_nvl_head.data_ptr<int>(),
-                     src_meta.data_ptr(), rdma_channel_prefix_matrix.data_ptr<int>(),
-                     rdma_rank_prefix_sum.data_ptr<int>(), gbl_channel_prefix_matrix.data_ptr<int>(), num_tokens,
-                     num_combined_tokens, hidden, num_topk, rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens,
-                     config.num_max_rdma_chunked_recv_tokens, buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens,
-                     config.num_max_nvl_chunked_recv_tokens, rank, num_ranks, comm_stream, num_channels,
-                     low_latency_mode, port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(),
-                     combine_nvls_head_mc, combine_nvls_head_dev, combine_nvls_tail_mc, combine_nvls_tail_dev,
-                     peer_rdma_bases_gpu);
+  void* combine_nvls_head_mc =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr;
+  void* combine_nvls_head_dev =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr;
+  void* combine_nvls_tail_mc =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr;
+  void* combine_nvls_tail_dev =
+      nvls_ht_enabled ? static_cast<void*>(static_cast<char*>(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr;
+  internode::combine(
+      at::cuda::ScalarTypeToCudaDataType(x.scalar_type()), combined_x.data_ptr(), combined_topk_weights_ptr,
+      is_combined_token_in_rank.data_ptr<bool>(), x.data_ptr(), topk_weights_ptr, combined_rdma_head.data_ptr<int>(),
+      combined_nvl_head.data_ptr<int>(), src_meta.data_ptr(), rdma_channel_prefix_matrix.data_ptr<int>(),
+      rdma_rank_prefix_sum.data_ptr<int>(), gbl_channel_prefix_matrix.data_ptr<int>(), num_tokens, num_combined_tokens,
+      hidden, num_topk, rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens,
+      config.num_max_rdma_chunked_recv_tokens, buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens,
+      config.num_max_nvl_chunked_recv_tokens, rank, num_ranks, comm_stream, num_channels, low_latency_mode,
+      port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), combine_nvls_head_mc,
+      combine_nvls_head_dev, combine_nvls_tail_mc, combine_nvls_tail_dev, peer_rdma_bases_gpu);
 
   std::optional<EventHandle> event;
   if (async) {
diff --git a/src/ext/ep/buffer.hpp b/src/ext/ep/buffer.hpp
index 36453af5..e9749847 100644
--- a/src/ext/ep/buffer.hpp
+++ b/src/ext/ep/buffer.hpp
@@ -155,8 +155,8 @@ struct Buffer {
   // Worst-case shape parameters used to size the buffer:
   //   stride_per_channel = num_rdma_ranks * num_rdma_ranks (counter slots)
   // We allocate for `kNvlsMaxChannels` so any `num_sms` config fits.
-  static constexpr int kNvlsMaxChannels = 64;       // num_sms / 2 upper bound
-  static constexpr int kNvlsPerPeerBytes = 1024;    // small-data per (sender, receiver) pair
+  static constexpr int kNvlsMaxChannels = 64;     // num_sms / 2 upper bound
+  static constexpr int kNvlsPerPeerBytes = 1024;  // small-data per (sender, receiver) pair
   // Number of distinct barrier slots in the barrier sub-region (each u64).
   static constexpr int kNvlsBarrierSlots = 8;
 
diff --git a/src/ext/ep/kernels/api.cuh b/src/ext/ep/kernels/api.cuh
index d728faa9..c47f2422 100644
--- a/src/ext/ep/kernels/api.cuh
+++ b/src/ext/ep/kernels/api.cuh
@@ -77,10 +77,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe
                      int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream,
                      int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool low_latency_mode,
                      mscclpp::PortChannelDeviceHandle* port_channel_handles,
-                     mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-                     void* nvls_mc_ptr, void* nvls_dev_ptr,
-                     size_t nvls_off_barrier, size_t nvls_off_data,
-                     uint64_t nvls_epoch, int nvls_per_peer_bytes);
+                     mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr,
+                     size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch, int nvls_per_peer_bytes);
 
 void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta,
               const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights,
@@ -93,9 +91,8 @@ void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float*
               int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks,
               bool is_cached_dispatch, cudaStream_t stream, int num_channels, bool low_latency_mode,
               mscclpp::PortChannelDeviceHandle* port_channel_handles,
-              mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-              void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev,
-              void* const* peer_rdma_bases);
+              mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev,
+              void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases);
 
 void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights, int num_ranks,
                    int num_channels, int num_combined_tokens, int* combined_rdma_head,
@@ -104,9 +101,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to
                    int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream,
                    int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool is_cached_dispatch, bool low_latency_mode,
                    mscclpp::PortChannelDeviceHandle* port_channel_handles,
-                   mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-                   void* nvls_mc_ptr = nullptr, void* nvls_dev_ptr = nullptr,
-                   size_t nvls_off_barrier = 0, uint64_t nvls_epoch = 0);
+                   mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr = nullptr,
+                   void* nvls_dev_ptr = nullptr, size_t nvls_off_barrier = 0, uint64_t nvls_epoch = 0);
 
 void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights, const bool* is_combined_token_in_rank,
              const void* x, const float* topk_weights, const int* combined_rdma_head, const int* combined_nvl_head,
@@ -116,9 +112,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights
              void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank,
              int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode,
              mscclpp::PortChannelDeviceHandle* port_channel_handles,
-             mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-             void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev,
-             void* const* peer_rdma_bases);
+             mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev,
+             void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases);
 
 }  // namespace internode
 
diff --git a/src/ext/ep/kernels/buffer.cuh b/src/ext/ep/kernels/buffer.cuh
index e006969d..98505c3c 100644
--- a/src/ext/ep/kernels/buffer.cuh
+++ b/src/ext/ep/kernels/buffer.cuh
@@ -2,9 +2,10 @@
 // Licensed under the MIT License.
 #pragma once
 
+#include <cstdint>
+
 #include "configs.cuh"
 #include "exception.cuh"
-#include <cstdint>
 
 namespace mscclpp {
 namespace ep {
diff --git a/src/ext/ep/kernels/internode.cu b/src/ext/ep/kernels/internode.cu
index 2b3dea6d..80ffc0f3 100644
--- a/src/ext/ep/kernels/internode.cu
+++ b/src/ext/ep/kernels/internode.cu
@@ -1,9 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT License.
 #include <limits>
-#include <type_traits>
 #include <mscclpp/memory_channel_device.hpp>
 #include <mscclpp/port_channel_device.hpp>
+#include <type_traits>
 
 #include "buffer.cuh"
 #include "configs.cuh"
@@ -22,8 +22,7 @@ namespace ep {
 using NvlPackT = std::conditional_t<NUM_MAX_NVL_PEERS == 8, uint64_t, uint32_t>;
 static_assert(NUM_MAX_NVL_PEERS == 8 || NUM_MAX_NVL_PEERS == 4,
               "NUM_MAX_NVL_PEERS must be 4 or 8 for HT internode kernel");
-static_assert(NUM_MAX_NVL_PEERS * sizeof(bool) == sizeof(NvlPackT),
-              "NvlPackT size must match NUM_MAX_NVL_PEERS bools");
+static_assert(NUM_MAX_NVL_PEERS * sizeof(bool) == sizeof(NvlPackT), "NvlPackT size must match NUM_MAX_NVL_PEERS bools");
 
 namespace internode {
 
@@ -273,23 +272,19 @@ __host__ __device__ __forceinline__ std::pair<int, int> get_nvl_clean_meta(int h
 }
 
 template <bool kLowLatencyMode, int kNumRDMARanks>
-__global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks,
-                                const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped,
-                                const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts,
-                                const bool* is_token_in_rank, int num_tokens, int num_channels, int expert_alignment,
-                                const int rdma_clean_offset, const int rdma_num_int_clean, const int nvl_clean_offset,
-                                const int nvl_num_int_clean, int* rdma_channel_prefix_matrix,
-                                int* recv_rdma_rank_prefix_sum, int* gbl_channel_prefix_matrix,
-                                int* recv_gbl_rank_prefix_sum, void* rdma_buffer_ptr, void** buffer_ptrs,
-                                int** task_fifo_ptrs, int head, int rank,
-                                mscclpp::PortChannelDeviceHandle* port_channel_handles,
-                                mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-                                // NVLS Phase 2 — replaces port_channel signal/wait + putWithSignal.
-                                // When `nvls_mc_ptr == nullptr` the legacy PortChannel path runs
-                                // unchanged (fallback for non-NVLS IB platforms).
-                                void* nvls_mc_ptr, void* nvls_dev_ptr,
-                                size_t nvls_off_barrier, size_t nvls_off_data,
-                                uint64_t nvls_epoch, int nvls_per_peer_bytes) {
+__global__ void notify_dispatch(
+    const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks, const int* num_tokens_per_rdma_rank,
+    int* moe_recv_rdma_counter_mapped, const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped,
+    int num_experts, const bool* is_token_in_rank, int num_tokens, int num_channels, int expert_alignment,
+    const int rdma_clean_offset, const int rdma_num_int_clean, const int nvl_clean_offset, const int nvl_num_int_clean,
+    int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum, int* gbl_channel_prefix_matrix,
+    int* recv_gbl_rank_prefix_sum, void* rdma_buffer_ptr, void** buffer_ptrs, int** task_fifo_ptrs, int head, int rank,
+    mscclpp::PortChannelDeviceHandle* port_channel_handles, mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
+    // NVLS Phase 2 — replaces port_channel signal/wait + putWithSignal.
+    // When `nvls_mc_ptr == nullptr` the legacy PortChannel path runs
+    // unchanged (fallback for non-NVLS IB platforms).
+    void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch,
+    int nvls_per_peer_bytes) {
   auto sm_id = static_cast<int>(blockIdx.x);
   auto thread_id = static_cast<int>(threadIdx.x), warp_id = thread_id / 32, lane_id = get_lane_id();
   auto num_threads = static_cast<int>(blockDim.x), num_warps = num_threads / 32;
@@ -397,9 +392,8 @@ __global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_co
       // the local send region — contiguous from send_buffer(0).
       int* src_ints = rdma_recv_num_tokens_mixed.send_buffer(0);
       const int total_ints = num_elems * kNumRDMARanks;
-      int* mc_slot = reinterpret_cast<int*>(
-          static_cast<char*>(nvls_mc_ptr) + nvls_off_data +
-          static_cast<size_t>(my_global_rank) * slot_stride_bytes);
+      int* mc_slot = reinterpret_cast<int*>(static_cast<char*>(nvls_mc_ptr) + nvls_off_data +
+                                            static_cast<size_t>(my_global_rank) * slot_stride_bytes);
       for (int i = thread_id; i < total_ints; i += num_threads) {
         int val = src_ints[i];
         MSCCLPP_EP_MULTIMEM_ST_RELAXED_U32(mc_slot + i, val);
@@ -425,10 +419,9 @@ __global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_co
       // the legacy SymBuffer recv_buffer(s) location.
       for (int s = 0; s < kNumRDMARanks; ++s) {
         const int sender_global = kLowLatencyMode ? s : (s * NUM_MAX_NVL_PEERS + nvl_rank);
-        const int* nvls_src = reinterpret_cast<const int*>(
-            static_cast<char*>(nvls_dev_ptr) + nvls_off_data +
-            static_cast<size_t>(sender_global) * slot_stride_bytes +
-            static_cast<size_t>(rdma_rank) * num_bytes);
+        const int* nvls_src = reinterpret_cast<const int*>(static_cast<char*>(nvls_dev_ptr) + nvls_off_data +
+                                                           static_cast<size_t>(sender_global) * slot_stride_bytes +
+                                                           static_cast<size_t>(rdma_rank) * num_bytes);
         int* dst = rdma_recv_num_tokens_mixed.recv_buffer(s);
         for (int i = thread_id; i < num_elems; i += num_threads) {
           dst[i] = nvls_src[i];
@@ -622,10 +615,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe
                      int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream,
                      int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool low_latency_mode,
                      mscclpp::PortChannelDeviceHandle* port_channel_handles,
-                     mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-                     void* nvls_mc_ptr, void* nvls_dev_ptr,
-                     size_t nvls_off_barrier, size_t nvls_off_data,
-                     uint64_t nvls_epoch, int nvls_per_peer_bytes) {
+                     mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr,
+                     size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch, int nvls_per_peer_bytes) {
 #define NOTIFY_DISPATCH_LAUNCH_CASE(num_rdma_ranks)                                                                  \
   {                                                                                                                  \
     auto notify_dispatch_func =                                                                                      \
@@ -636,8 +627,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe
                   expert_alignment, rdma_clean_meta.first, rdma_clean_meta.second, nvl_clean_meta.first,             \
                   nvl_clean_meta.second, rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum,                      \
                   gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, rdma_buffer_ptr, buffer_ptrs, task_fifo_ptrs, \
-                  head, rank, port_channel_handles, memory_channel_handles,                                          \
-                  nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_off_data, nvls_epoch, nvls_per_peer_bytes);      \
+                  head, rank, port_channel_handles, memory_channel_handles, nvls_mc_ptr, nvls_dev_ptr,               \
+                  nvls_off_barrier, nvls_off_data, nvls_epoch, nvls_per_peer_bytes);                                 \
   }                                                                                                                  \
   break
 
@@ -856,10 +847,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
         const auto dst_offset = rdma_rank * num_bytes + meta_recv_offset;
         const auto src_offset = dst_rdma_rank * num_bytes + meta_send_offset;
         const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-        const int* src_p = reinterpret_cast<const int*>(
-            reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
-        int* dst_p = reinterpret_cast<int*>(
-            reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
+        const int* src_p = reinterpret_cast<const int*>(reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
+        int* dst_p = reinterpret_cast<int*>(reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
         const int n_int = (int)(num_bytes / sizeof(int));
         for (int k = lane_id; k < n_int; k += 32) {
           dst_p[k] = src_p[k];
@@ -904,8 +893,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
           // Phase 4: head feedback path \u2014 cross-node uses fabric-VA store,
           // self-loop uses local atomic. Both end up in rdma_channel_head;
           // single read via ld_volatile_global covers them. NVLS removed.
-          cached_rdma_channel_head =
-              static_cast<int>(ld_volatile_global(rdma_channel_head.buffer(lane_id)));
+          cached_rdma_channel_head = static_cast<int>(ld_volatile_global(rdma_channel_head.buffer(lane_id)));
         }
       }
       __syncwarp();
@@ -991,8 +979,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
     __syncwarp();
 
     // Update last token tail (epilogue). See in-loop note on atomicMax.
-    if (last_rdma_tail_idx >= 0)
-      atomicMax(const_cast<int*>(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1);
+    if (last_rdma_tail_idx >= 0) atomicMax(const_cast<int*>(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1);
 
     // Release sequential lock
     lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;
@@ -1035,8 +1022,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
         if (owner && num_tokens_to_send > 0) {
           auto processed_tail = ld_acquire_cta(const_cast<const int*>(rdma_send_channel_tail + dst_rdma_rank));
           auto num_tokens_processed = processed_tail - last_issued_tail;
-          if (num_tokens_processed == num_tokens_to_send ||
-              num_tokens_processed >= num_max_rdma_chunked_send_tokens) {
+          if (num_tokens_processed == num_tokens_to_send || num_tokens_processed >= num_max_rdma_chunked_send_tokens) {
             int n = min(num_tokens_processed, num_max_rdma_chunked_send_tokens);
             EP_DEVICE_ASSERT(n >= 0 && n <= num_tokens_to_send);
             my_num_tokens_to_issue = n;
@@ -1067,10 +1053,10 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
               // ordering pair (release semantics on multimem.red triggers
               // unspecified launch failure on Azure GB200).
               const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-              const int4* src_p = reinterpret_cast<const int4*>(
-                  reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
-              int4* dst_p = reinterpret_cast<int4*>(
-                  reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
+              const int4* src_p =
+                  reinterpret_cast<const int4*>(reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
+              int4* dst_p =
+                  reinterpret_cast<int4*>(reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
               const int n_int4 = (int)(num_bytes_per_msg / sizeof(int4));
               // Unrolled 8x to give the LSU pipeline more outstanding stores
               // per lane. Each lane handles k, k+32, ..., k+224 per iter
@@ -1120,26 +1106,24 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
           if (owner) {
             if (peer_rdma_bases != nullptr && dst_rdma_rank != rdma_rank) {
               const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-              const uintptr_t my_tail_off =
-                  reinterpret_cast<uintptr_t>(rdma_channel_tail.buffer(rdma_rank)) -
-                  reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+              const uintptr_t my_tail_off = reinterpret_cast<uintptr_t>(rdma_channel_tail.buffer(rdma_rank)) -
+                                            reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
               uint64_t* peer_tail = reinterpret_cast<uint64_t*>(
                   reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + my_tail_off);
               const uint64_t new_tail = (uint64_t)issue_tail + (uint64_t)n_issue;
-              asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_tail), "l"(new_tail) : "memory");
+              asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_tail), "l"(new_tail) : "memory");
             } else {
               // Self-loop: plain release atomic on local slot (no multicast).
-              mscclpp::atomicFetchAdd(
-                  reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)),
-                  (uint64_t)n_issue, mscclpp::memoryOrderRelease);
+              mscclpp::atomicFetchAdd(reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)),
+                                      (uint64_t)n_issue, mscclpp::memoryOrderRelease);
             }
           }
         } else if (owner) {
           // Legacy non-NVLS path (single-lane).
           if (dst_rdma_rank == rdma_rank) {
             // Update tails
-            mscclpp::atomicFetchAdd(reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)),
-                                    (uint64_t)n_issue, mscclpp::memoryOrderRelease);
+            mscclpp::atomicFetchAdd(reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)), (uint64_t)n_issue,
+                                    mscclpp::memoryOrderRelease);
           } else {
             const auto dst_slot_idx = issue_tail % num_max_rdma_chunked_recv_tokens;
             const size_t num_bytes_per_msg = (size_t)num_bytes_per_rdma_token * (size_t)n_issue;
@@ -1147,9 +1131,9 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
                                     dst_slot_idx * num_bytes_per_rdma_token + data_recv_offset;
             const auto src_offset = dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) +
                                     dst_slot_idx * num_bytes_per_rdma_token + data_send_offset;
-            const auto port_channel_idx =
-                kLowLatencyMode ? (channel_id * kNumRDMARanks + dst_rdma_rank)
-                                : (channel_id * num_ranks + dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank);
+            const auto port_channel_idx = kLowLatencyMode
+                                              ? (channel_id * kNumRDMARanks + dst_rdma_rank)
+                                              : (channel_id * num_ranks + dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank);
             auto& handle = port_channel_handles[port_channel_idx];
             handle.put(dst_offset, src_offset, num_bytes_per_msg);
 
@@ -1159,9 +1143,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
             const uint64_t new_tail = (uint64_t)issue_tail + (uint64_t)n_issue;
             *rdma_channel_tail_send_src.buffer(dst_rdma_rank) = new_tail;
             __threadfence_system();
-            const auto src_off_tail =
-                reinterpret_cast<uintptr_t>(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) -
-                reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+            const auto src_off_tail = reinterpret_cast<uintptr_t>(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) -
+                                      reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
             handle.put(rdma_rank * sizeof(uint64_t) + tail_send_offset, src_off_tail, sizeof(uint64_t));
           }
         }
@@ -1271,8 +1254,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
             // (sender writes peer's rdma_channel_tail slot). Self-loop tail
             // is a plain local atomic. Both end up in rdma_channel_tail —
             // single read path via ld_acquire_sys_global covers them.
-            cached_rdma_channel_tail =
-                static_cast<int>(ld_acquire_sys_global(rdma_channel_tail.buffer(src_rdma_rank)));
+            cached_rdma_channel_tail = static_cast<int>(ld_acquire_sys_global(rdma_channel_tail.buffer(src_rdma_rank)));
           }
           if (__shfl_sync(0xffffffff, cached_rdma_channel_tail > cached_rdma_channel_head, src_rdma_rank)) break;
         }
@@ -1394,9 +1376,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
       // last chunks to never trigger head feedback, which deadlocked at
       // larger chunk_send values where 4096 tokens / 10 channels / 2 peers
       // ≈ 205 tokens ⇒ 3 full chunks + 1 partial.
-      const bool any_retired = forward_channel_retired[0] || forward_channel_retired[1] ||
-                               forward_channel_retired[2] || forward_channel_retired[3] ||
-                               forward_channel_retired[4] || forward_channel_retired[5] ||
+      const bool any_retired = forward_channel_retired[0] || forward_channel_retired[1] || forward_channel_retired[2] ||
+                               forward_channel_retired[3] || forward_channel_retired[4] || forward_channel_retired[5] ||
                                forward_channel_retired[6] || forward_channel_retired[7];
       const int head_update_threshold = any_retired ? 1 : max(1, num_max_rdma_chunked_send_tokens / 4);
       if (min_head != std::numeric_limits<int>::max() and min_head >= last_head + head_update_threshold and
@@ -1408,12 +1389,11 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
           // is `peer.rdma_channel_head.buffer(my_rdma_rank)`). Bypasses
           // both broken port_channel.put and the unreliable NVLS counter.
           const int dst_rank_global = lane_id * NUM_MAX_NVL_PEERS + nvl_rank;
-          const uintptr_t my_head_off =
-              reinterpret_cast<uintptr_t>(rdma_channel_head.buffer(rdma_rank)) -
-              reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
-          uint64_t* peer_head = reinterpret_cast<uint64_t*>(
-              reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + my_head_off);
-          asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_head), "l"((uint64_t)min_head) : "memory");
+          const uintptr_t my_head_off = reinterpret_cast<uintptr_t>(rdma_channel_head.buffer(rdma_rank)) -
+                                        reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+          uint64_t* peer_head =
+              reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + my_head_off);
+          asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_head), "l"((uint64_t)min_head) : "memory");
         } else if (lane_id == rdma_rank) {
           // Self-loop: plain release atomic on local slot. Cannot use NVLS
           // multimem here \u2014 it fans out to all NVL peers' local buffers
@@ -1428,9 +1408,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV
           // Absolute-value RDMA WRITE replaces broken HW atomicAdd (see note above).
           *rdma_channel_head_send_src.buffer(lane_id) = (uint64_t)min_head;
           __threadfence_system();
-          const auto src_off_head =
-              reinterpret_cast<uintptr_t>(rdma_channel_head_send_src.buffer(lane_id)) -
-              reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+          const auto src_off_head = reinterpret_cast<uintptr_t>(rdma_channel_head_send_src.buffer(lane_id)) -
+                                    reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
           handle.put(dst_offset, src_off_head, sizeof(uint64_t));
         }
         last_head = min_head;
@@ -1548,9 +1527,8 @@ void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float*
               int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks,
               bool is_cached_dispatch, cudaStream_t stream, int num_channels, bool low_latency_mode,
               mscclpp::PortChannelDeviceHandle* port_channel_handles,
-              mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-              void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev,
-              void* const* peer_rdma_bases) {
+              mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev,
+              void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases) {
   constexpr int kNumDispatchRDMASenderWarps = 7;
 
 #define DISPATCH_LAUNCH_CASE(num_rdma_ranks)                                                                           \
@@ -1591,8 +1569,7 @@ __global__ void cached_notify(const int rdma_clean_offset, const int rdma_num_in
                               // Phase 4: NVLS multimem barrier path — bypasses port_channel signal/wait
                               // (broken on Azure CX-7 RoCE) by using multimem.red.add + ld.acquire on a
                               // pair of barrier slots (offsets +24 and +32). Both nullptr ⇒ fall back to IB.
-                              void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier,
-                              uint64_t nvls_epoch) {
+                              void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, uint64_t nvls_epoch) {
   auto sm_id = static_cast<int>(blockIdx.x);
   auto thread_id = static_cast<int>(threadIdx.x);
   auto num_threads = static_cast<int>(blockDim.x);
@@ -1746,8 +1723,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to
                    int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream,
                    int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool is_cached_dispatch, bool low_latency_mode,
                    mscclpp::PortChannelDeviceHandle* port_channel_handles,
-                   mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-                   void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, uint64_t nvls_epoch) {
+                   mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr,
+                   size_t nvls_off_barrier, uint64_t nvls_epoch) {
   const int num_threads = std::max(128, 32 * num_channels);
   const auto num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
 
@@ -1775,8 +1752,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to
   LAUNCH_KERNEL(&cfg, cached_notify_func, rdma_clean_meta.first, rdma_clean_meta.second, nvl_clean_meta.first,
                 nvl_clean_meta.second, combined_rdma_head, num_combined_tokens, num_channels,
                 rdma_channel_prefix_matrix, rdma_rank_prefix_sum, combined_nvl_head, rdma_buffer_ptr, buffer_ptrs,
-                task_fifo_ptrs, head, rank, num_ranks, is_cached_dispatch, port_channel_handles,
-                memory_channel_handles, nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_epoch);
+                task_fifo_ptrs, head, rank, num_ranks, is_cached_dispatch, port_channel_handles, memory_channel_handles,
+                nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_epoch);
 }
 
 template <int kNumRanks, typename dtype_t, int kMaxNumRanks, typename ReceiveFn, typename ReceiveTWFn>
@@ -2118,8 +2095,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
           // Phase 4: head read — cross-node head feedback comes from peer
           // via fabric-VA store, self-loop head from local atomic. Both end
           // up in rdma_channel_head; one read path covers them.
-          int num_used_slots = token_start_idx -
-                               static_cast<int>(ld_volatile_global(rdma_channel_head.buffer(dst_rdma_rank)));
+          int num_used_slots =
+              token_start_idx - static_cast<int>(ld_volatile_global(rdma_channel_head.buffer(dst_rdma_rank)));
           if (num_max_rdma_chunked_recv_tokens - num_used_slots >= num_chunked_tokens) break;
 
           // Timeout check
@@ -2188,20 +2165,17 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
             // is available; otherwise single-lane handle.put + flush.
             if (dst_rdma_rank != rdma_rank) {
               const auto rdma_slot_idx = token_start_idx % num_max_rdma_chunked_recv_tokens;
-              const size_t num_bytes_per_msg =
-                  (size_t)num_chunked_tokens * (size_t)num_bytes_per_rdma_token;
-              const auto dst_offset =
-                  rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) +
-                  rdma_slot_idx * num_bytes_per_rdma_token + data_recv_offset;
-              const auto src_offset =
-                  dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) +
-                  rdma_slot_idx * num_bytes_per_rdma_token + data_send_offset;
+              const size_t num_bytes_per_msg = (size_t)num_chunked_tokens * (size_t)num_bytes_per_rdma_token;
+              const auto dst_offset = rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) +
+                                      rdma_slot_idx * num_bytes_per_rdma_token + data_recv_offset;
+              const auto src_offset = dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) +
+                                      rdma_slot_idx * num_bytes_per_rdma_token + data_send_offset;
               if (peer_rdma_bases != nullptr) {
                 const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-                const int4* src_p = reinterpret_cast<const int4*>(
-                    reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
-                int4* dst_p = reinterpret_cast<int4*>(
-                    reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
+                const int4* src_p =
+                    reinterpret_cast<const int4*>(reinterpret_cast<uint8_t*>(rdma_buffer_ptr_base) + src_offset);
+                int4* dst_p =
+                    reinterpret_cast<int4*>(reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + dst_offset);
                 const int n_int4 = (int)(num_bytes_per_msg / sizeof(int4));
                 // Same 8x unroll as dispatch sender for LSU pipelining.
                 const int stride8 = 8 * 32;
@@ -2246,18 +2220,16 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
             if (dst_rdma_rank != rdma_rank) {
               if (peer_rdma_bases != nullptr && lane_id == 0) {
                 const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-                const uintptr_t my_tail_off =
-                    reinterpret_cast<uintptr_t>(rdma_channel_tail.buffer(rdma_rank)) -
-                    reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+                const uintptr_t my_tail_off = reinterpret_cast<uintptr_t>(rdma_channel_tail.buffer(rdma_rank)) -
+                                              reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
                 uint64_t* peer_tail = reinterpret_cast<uint64_t*>(
                     reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + my_tail_off);
                 const uint64_t new_tail = (uint64_t)(token_start_idx + num_chunked_tokens);
-                asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_tail), "l"(new_tail) : "memory");
+                asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_tail), "l"(new_tail) : "memory");
               }
             } else if (lane_id == 0) {
-              mscclpp::atomicFetchAdd(
-                  reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)),
-                  (uint64_t)num_chunked_tokens, mscclpp::memoryOrderRelease);
+              mscclpp::atomicFetchAdd(reinterpret_cast<uint64_t*>(rdma_channel_tail.buffer(rdma_rank)),
+                                      (uint64_t)num_chunked_tokens, mscclpp::memoryOrderRelease);
             }
           } else if (lane_id == 0) {
             if (dst_rdma_rank == rdma_rank) {
@@ -2280,9 +2252,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
               const uint64_t new_tail = (uint64_t)(token_start_idx + num_chunked_tokens);
               *rdma_channel_tail_send_src.buffer(dst_rdma_rank) = new_tail;
               __threadfence_system();
-              const auto src_off_tail =
-                  reinterpret_cast<uintptr_t>(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) -
-                  reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+              const auto src_off_tail = reinterpret_cast<uintptr_t>(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) -
+                                        reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
               handle.put(rdma_rank * sizeof(uint64_t) + tail_send_offset, src_off_tail, sizeof(uint64_t));
             }
           }
@@ -2388,12 +2359,11 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
             //   self-loop → local atomicAdd
             if (peer_rdma_bases != nullptr && dst_rdma_rank != rdma_rank) {
               const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank;
-              const uintptr_t my_head_off =
-                  reinterpret_cast<uintptr_t>(rdma_channel_head.buffer(rdma_rank)) -
-                  reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+              const uintptr_t my_head_off = reinterpret_cast<uintptr_t>(rdma_channel_head.buffer(rdma_rank)) -
+                                            reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
               uint64_t* peer_head = reinterpret_cast<uint64_t*>(
                   reinterpret_cast<uint8_t*>(peer_rdma_bases[dst_rank_global]) + my_head_off);
-              asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_head), "l"((uint64_t)min_head) : "memory");
+              asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_head), "l"((uint64_t)min_head) : "memory");
             } else if (dst_rdma_rank == rdma_rank) {
               mscclpp::atomicFetchAdd(static_cast<uint64_t*>(rdma_channel_head.buffer(rdma_rank)),
                                       (uint64_t)(min_head - last_rdma_head), mscclpp::memoryOrderRelease);
@@ -2406,9 +2376,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32,
               // Absolute-value RDMA WRITE replaces broken HW atomicAdd.
               *rdma_channel_head_send_src.buffer(dst_rdma_rank) = (uint64_t)min_head;
               __threadfence_system();
-              const auto src_off_head =
-                  reinterpret_cast<uintptr_t>(rdma_channel_head_send_src.buffer(dst_rdma_rank)) -
-                  reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
+              const auto src_off_head = reinterpret_cast<uintptr_t>(rdma_channel_head_send_src.buffer(dst_rdma_rank)) -
+                                        reinterpret_cast<uintptr_t>(rdma_buffer_ptr_base);
               handle.put(dst_offset, src_off_head, sizeof(uint64_t));
             }
             last_rdma_head = min_head;
@@ -2443,9 +2412,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights
              void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank,
              int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode,
              mscclpp::PortChannelDeviceHandle* port_channel_handles,
-             mscclpp::MemoryChannelDeviceHandle* memory_channel_handles,
-             void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev,
-             void* const* peer_rdma_bases) {
+             mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev,
+             void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases) {
   constexpr int kNumCombineForwarderWarps = 16;
 
 #define COMBINE_LAUNCH_CASE(num_rdma_ranks)                                                                           \
@@ -2458,8 +2426,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights
                   rdma_rank_prefix_sum, gbl_channel_prefix_matrix, num_tokens, num_combined_tokens, hidden, num_topk, \
                   rdma_buffer_ptr, num_max_rdma_chunked_send_tokens, num_max_rdma_chunked_recv_tokens, buffer_ptrs,   \
                   num_max_nvl_chunked_send_tokens, num_max_nvl_chunked_recv_tokens, rank, num_ranks,                  \
-                  port_channel_handles, memory_channel_handles,                                                       \
-                  nvls_head_mc, nvls_head_dev, nvls_tail_mc, nvls_tail_dev, peer_rdma_bases);                         \
+                  port_channel_handles, memory_channel_handles, nvls_head_mc, nvls_head_dev, nvls_tail_mc,            \
+                  nvls_tail_dev, peer_rdma_bases);                                                                    \
   }                                                                                                                   \
   break
 
diff --git a/src/ext/ep/kernels/internode_ll.cu b/src/ext/ep/kernels/internode_ll.cu
index e4c3781c..827e2ee8 100644
--- a/src/ext/ep/kernels/internode_ll.cu
+++ b/src/ext/ep/kernels/internode_ll.cu
@@ -431,16 +431,13 @@ void dispatch(void* packed_recv_x, float* packed_recv_x_scales, int* packed_recv
     cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, cur_dev);
     int max_active_narrow = 0;
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_narrow,
-        (void*)dispatch<false, true, kNumWarpGroupsIpc, kNumWarpsPerGroupIpc, 7168>,
+        &max_active_narrow, (void*)dispatch<false, true, kNumWarpGroupsIpc, kNumWarpsPerGroupIpc, 7168>,
         kNumWarpGroupsIpc * kNumWarpsPerGroupIpc * 32, 0);
     const int cap_narrow = max_active_narrow * sm_count;
     ipc_wide = (num_experts > cap_narrow);
   }
-  const int kNumWarpGroups =
-      (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma;
-  const int kNumWarpsPerGroup =
-      (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma;
+  const int kNumWarpGroups = (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma;
+  const int kNumWarpsPerGroup = (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma;
   const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup;
   const auto num_sms_base = cell_div(num_experts, kNumWarpGroups);
   // LL dispatch/combine are latency-bound at typical problem sizes: for
@@ -688,16 +685,13 @@ void combine(void* combined_x, void* rdma_recv_x, int64_t* rdma_recv_flag, void*
     cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, cur_dev);
     int max_active_narrow = 0;
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_active_narrow,
-        (void*)combine<true, kNumWarpGroupsIpc, kNumWarpsPerGroupIpc, 7168, kNumMaxTopk>,
+        &max_active_narrow, (void*)combine<true, kNumWarpGroupsIpc, kNumWarpsPerGroupIpc, 7168, kNumMaxTopk>,
         kNumWarpGroupsIpc * kNumWarpsPerGroupIpc * 32, 0);
     const int cap_narrow = max_active_narrow * sm_count;
     ipc_wide = (num_experts > cap_narrow);
   }
-  const int kNumWarpGroups =
-      (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma;
-  const int kNumWarpsPerGroup =
-      (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma;
+  const int kNumWarpGroups = (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma;
+  const int kNumWarpsPerGroup = (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma;
   const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup;
   const auto num_sms_base = cell_div(num_experts, kNumWarpGroups);
   // See the comment in `dispatch()` above: combine-recv's per-token loop
diff --git a/src/ext/ep/kernels/utils.cuh b/src/ext/ep/kernels/utils.cuh
index fadd2488..ec84a507 100644
--- a/src/ext/ep/kernels/utils.cuh
+++ b/src/ext/ep/kernels/utils.cuh
@@ -2,10 +2,10 @@
 // Licensed under the MIT License.
 #pragma once
 
+#include <cstdint>
 #include <type_traits>
 
 #include "exception.cuh"
-#include <cstdint>
 
 #define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC)                        \
   {                                                                                                      \
diff --git a/test/python/ext/ep/test_internode_multirank.py b/test/python/ext/ep/test_internode_multirank.py
index b7ae2e8d..8fec792b 100644
--- a/test/python/ext/ep/test_internode_multirank.py
+++ b/test/python/ext/ep/test_internode_multirank.py
@@ -96,6 +96,7 @@ def main():
 
     # Small settings for functional check
     import os as _os
+
     num_tokens = int(_os.environ.get("MSCCLPP_EP_HT_TOKENS", "128"))
     hidden = int(_os.environ.get("MSCCLPP_EP_HT_HIDDEN", "1024"))
     num_topk = int(_os.environ.get("MSCCLPP_EP_HT_TOPK", str(min(4, num_ranks))))
@@ -140,7 +141,13 @@ def main():
 
     # Buffer config for internode HT: needs num_rdma_bytes > 0. Size buffers
     # using max(hidden, bench_hidden) so the optional bench phase fits.
-    cfg = ep.Config(int(os.environ.get("MSCCLPP_EP_NSM","152")), int(os.environ.get("MSCCLPP_EP_NVL_SEND","8")), int(os.environ.get("MSCCLPP_EP_NVL_RECV","256")), int(os.environ.get("MSCCLPP_EP_RDMA_SEND","16")), int(os.environ.get("MSCCLPP_EP_RDMA_RECV","128")))
+    cfg = ep.Config(
+        int(os.environ.get("MSCCLPP_EP_NSM", "152")),
+        int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")),
+        int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256")),
+        int(os.environ.get("MSCCLPP_EP_RDMA_SEND", "16")),
+        int(os.environ.get("MSCCLPP_EP_RDMA_RECV", "128")),
+    )
     _bench_on = os.environ.get("MSCCLPP_EP_BENCH", "0") == "1"
     _buf_hidden = max(hidden, int(os.environ.get("MSCCLPP_EP_BENCH_HIDDEN", "0"))) if _bench_on else hidden
     num_nvl_bytes = cfg.get_nvl_buffer_size_hint(_buf_hidden * x.element_size(), num_ranks)
diff --git a/test/python/ext/ep/test_intranode_multirank.py b/test/python/ext/ep/test_intranode_multirank.py
index c6ac77f8..5bb9a929 100644
--- a/test/python/ext/ep/test_intranode_multirank.py
+++ b/test/python/ext/ep/test_intranode_multirank.py
@@ -106,9 +106,11 @@ def main():
 
     # Allocate Buffer (intranode only: num_rdma_bytes=0). Size the NVL buffer
     # using max(hidden, bench_hidden) so the optional bench phase fits.
-    cfg = ep.Config(int(os.environ.get("MSCCLPP_EP_NUM_SMS", "20")),
-                    int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")),
-                    int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256")))
+    cfg = ep.Config(
+        int(os.environ.get("MSCCLPP_EP_NUM_SMS", "20")),
+        int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")),
+        int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256")),
+    )
     _bench_on = os.environ.get("MSCCLPP_EP_BENCH", "0") == "1"
     _buf_hidden = max(hidden, int(os.environ.get("MSCCLPP_EP_BENCH_HIDDEN", "0"))) if _bench_on else hidden
     num_nvl_bytes = cfg.get_nvl_buffer_size_hint(_buf_hidden * x.element_size(), num_ranks)
@@ -304,8 +306,8 @@ def main():
     # This matches NCCL-EP's `ep_bench` convention and isolates the on-GPU
     # dispatch kernel cost from one-time setup overhead.
     _layout = _dispatch()
-    _cached_rpm = _layout[5]   # rank_prefix_matrix
-    _cached_cpm = _layout[6]   # channel_prefix_matrix
+    _cached_rpm = _layout[5]  # rank_prefix_matrix
+    _cached_cpm = _layout[6]  # channel_prefix_matrix
     _cached_n = int(_layout[0].size(0))  # num_recv_tokens on this rank
 
     def _dispatch_cached():