From e874bf16663a30cd50c168d244211a4ab9c5b97c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 10:12:40 -0700 Subject: [PATCH 01/12] fix: isCuMemMapAllocated crashes on non-NVLS systems even with MSCCLPP_FORCE_DISABLE_NVLS=true (#790) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [x] Fix `isCuMemMapAllocated()` to just return `true/false` without throwing when NVLS is not supported - [x] Fix `isNvlsSupported()` caching bug where `result`/`isChecked` were never updated - [x] Restore `[[maybe_unused]]` on `result` and `isChecked` statics — needed in HIP/ROCm env where `CUDA_NVLS_API_AVAILABLE` is not defined and the variables would otherwise be unused - [x] Run linter (`./tools/lint.sh`) --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Binyang2014 <9415966+Binyang2014@users.noreply.github.com> --- src/core/gpu_utils.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 628d2dcb..09d5025d 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -283,7 +283,9 @@ bool isNvlsSupported() { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId)); MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - return isMulticastSupported == 1; + result = (isMulticastSupported == 1); + isChecked = true; + return result; } return result; #endif @@ -300,9 +302,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) { return false; } MSCCLPP_CUTHROW(cuMemRelease(handle)); - if (!isNvlsSupported()) { - throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage); - } return true; #endif } From c97be492d5d097f2ab5885e74b029610845400a4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 Apr 2026 10:32:20 -0700 Subject: [PATCH 02/12] GDRCopy status message to string (#793) --- src/core/gdr.cc | 4 ++-- src/core/include/gdr.hpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/core/gdr.cc b/src/core/gdr.cc index 22ac15c9..f361a3aa 100644 --- a/src/core/gdr.cc +++ b/src/core/gdr.cc @@ -48,7 +48,7 @@ GdrStatus gdrStatus() { return gdrContext()->status(); } bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; } -const char* gdrStatusMessage() { +std::string gdrStatusMessage() { switch (gdrStatus()) { case GdrStatus::Ok: return "GDRCopy initialized successfully"; @@ -181,7 +181,7 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; } bool gdrEnabled() { return false; } -const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; } +std::string gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; } // GdrMap::Impl — stub (no GDRCopy) diff --git a/src/core/include/gdr.hpp b/src/core/include/gdr.hpp index e0c7f006..c1378334 100644 --- a/src/core/include/gdr.hpp +++ b/src/core/include/gdr.hpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace mscclpp { @@ -25,7 +26,7 @@ GdrStatus gdrStatus(); bool gdrEnabled(); /// Return a human-readable error message for the current GDRCopy status. -const char* gdrStatusMessage(); +std::string gdrStatusMessage(); /// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address. /// When GDRCopy is not available, all operations are no-ops and valid() returns false. From 2c52937b26e6b72846cb8bec2f7479fb90162913 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 28 Apr 2026 15:02:22 -0700 Subject: [PATCH 03/12] Fix FP8 ROCm build/test issues and dtype naming (#792) ## Summary - Fix ROCm FP8 build failure by using the actual FP8 `DataType` enum constants in allreduce packet tuning. - Fix FP8 E4M3FNUZ test encoding so small negative values do not produce the FNUZ NaN byte (`0x80`). - Align FP8 `DataType` enum constants and Python bindings with torch-style names (`FLOAT8_E4M3FN`, `FLOAT8_E4M3FNUZ`, `FLOAT8_E5M2FNUZ` / `float8_e4m3fn`, `float8_e4m3fnuz`, `float8_e5m2fnuz`). ## Validation - `./tools/lint.sh` - `make -j` from `build/` - `mpirun --allow-run-as-root -np 8 python3 -m pytest python/test/test_fp8_accum.py -q` (`36 passed, 9 skipped`) - `DTYPE=float8_e4m3fnuz ACCUM_DTYPE=float32 torchrun --nnodes=1 --nproc_per_node=8 examples/torch-integration/customized_comm_with_tuning.py` --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/gpu_data_types.hpp | 104 ++++-------- python/csrc/core_py.cpp | 6 +- python/mscclpp/utils.py | 12 +- python/test/test_fp8_accum.py | 152 +++++++++++++----- src/core/algorithm.cc | 35 +++- src/core/executor/execution_kernel.cu | 4 +- src/core/executor/executor.cc | 4 +- src/core/include/execution_kernel.hpp | 27 +++- .../collectives/allreduce/allreduce_packet.cu | 5 +- .../collectives/include/allreduce/common.hpp | 14 +- src/ext/nccl/algorithm_selector.cc | 3 +- src/ext/nccl/datatype_conversion.hpp | 43 ++++- 12 files changed, 271 insertions(+), 138 deletions(-) diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 41bd5928..672434f9 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -21,7 +21,10 @@ using __bfloat162 = __hip_bfloat162; #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) #include -// Create aliases matching CUDA naming convention for cross-platform compatibility +// Create aliases matching CUDA naming convention for cross-platform compatibility. +// Define __FP8_E4M3_IS_FNUZ__ / __FP8_E5M2_IS_FNUZ__ when the platform-native FP8 is the +// "fnuz" variant (no infinities, NaN-only at 0x80, bias differs from OCP). Dispatch layers +// use these macros to throw on unsupported variants requested via DataType. #if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP) using __fp8_e4m3 = __hip_fp8_e4m3_fnuz; using __fp8_e5m2 = __hip_fp8_e5m2_fnuz; @@ -29,6 +32,8 @@ using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz; using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz; using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz; using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz; +#define __FP8_E4M3_IS_FNUZ__ +#define __FP8_E5M2_IS_FNUZ__ #else using __fp8_e4m3 = __hip_fp8_e4m3; using __fp8_e5m2 = __hip_fp8_e5m2; @@ -66,8 +71,8 @@ using __bfloat162 = __nv_bfloat162; /// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15. /// Format (MSB first): [sign:1][exponent:4][mantissa:3] -/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention). -/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6. +/// No infinities, no NaN. Encode saturates to ±1.75 (0x7e/0xfe). +/// Adapted from the Triton compiler's fp8e4b15 format. struct alignas(1) __fp8_e4m3b15 { uint8_t __x; @@ -97,35 +102,15 @@ struct alignas(1) __fp8_e4m3b15 { /// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8, /// then convert fp16 → float32. static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) { - // Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN. - uint32_t exp = (bits >> 3) & 0xFu; - if (bits == 0x80 || exp == 15) { - union { - uint32_t u; - float f; - } nan_val = {0x7FC00000u}; - return nan_val.f; - } - if (bits == 0) return 0.0f; - - // Triton-style bit manipulation: fp8 → fp16 → fp32. - // fp8 layout: [S:1][E:4][M:3] (bias=15) - // fp16 layout: [S:1][E:5][M:10] (bias=15) - // - // Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1 - // to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15. + // Branch-free decode: fp8 → fp16 → fp32, no special-case handling. + // Encode saturates to ±1.75, so 0x7f/0xff are never produced. // Refer: // https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34 uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16 uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign) - uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 + uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 (E4→E5) - // For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0 - // and fp16 mantissa = (fp8_mantissa << 7), which correctly represents - // the subnormal fp16 value since both share bias=15. - - // Convert fp16 bits to float via __half (works on host and device, CUDA and HIP). union { uint16_t u; __half h; @@ -139,14 +124,6 @@ struct alignas(1) __fp8_e4m3b15 { /// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15), /// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1. static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) { - union { - float f; - uint32_t u; - } in = {val}; - - // NaN → 0x80 (negative-zero bit pattern = NaN in fnuz). - if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u; - // Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP). __half h_val = __float2half_rn(val); union { @@ -155,32 +132,19 @@ struct alignas(1) __fp8_e4m3b15 { } cvt = {h_val}; uint16_t fp16_bits = cvt.u; - // Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80. + // Clamp abs to max encodable value: 1.75 → fp16 = 0x3F00. + // Matches Triton: encode saturates, 0x7f/0xff are never produced. uint16_t abs_fp16 = fp16_bits & 0x7FFFu; - if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u; + if (abs_fp16 > 0x3F00u) abs_fp16 = 0x3F00u; // Reconstruct with sign. uint16_t sign16 = fp16_bits & 0x8000u; - // Triton-style: fp16 → fp8. - // fp16 layout: [S:1][E:5][M:10] (bias=15) - // fp8 layout: [S:1][E:4][M:3] (bias=15) - // - // mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080) - // This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias. - // Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0 - // Finally: prmt for byte extraction. - // - // Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte. + // fp16 → fp8: shift abs left by 1 (undo decode's right-shift), add rounding bias, take upper byte. uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u); - // The upper byte now contains [E:4][M:3][round_bit]. - // Combine with sign and extract. uint16_t with_sign = sign16 | adjusted; uint8_t result = (uint8_t)(with_sign >> 8); - // Zero → 0x00 (ensure positive zero, not negative zero which is NaN). - if ((result & 0x7Fu) == 0) result = 0x00u; - return result; } }; @@ -199,16 +163,18 @@ namespace mscclpp { /// Data types supported by mscclpp operations. enum class DataType { - INT32, // 32-bit signed integer. - UINT32, // 32-bit unsigned integer. - FLOAT16, // IEEE 754 half precision. - FLOAT32, // IEEE 754 single precision. - BFLOAT16, // bfloat16 precision. - FLOAT8_E4M3, // float8 with E4M3 layout. - FLOAT8_E5M2, // float8 with E5M2 layout. - UINT8, // 8-bit unsigned integer. - FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). - AUTO = 255, // Sentinel: resolve to the input dtype at runtime. + INT32, // 32-bit signed integer. + UINT32, // 32-bit unsigned integer. + FLOAT16, // IEEE 754 half precision. + FLOAT32, // IEEE 754 single precision. + BFLOAT16, // bfloat16 precision. + FLOAT8_E4M3FN, // float8 E4M3, OCP variant (NV; AMD HIP > 6 with OCP enabled). + FLOAT8_E4M3FNUZ, // float8 E4M3, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled). + FLOAT8_E5M2, // float8 E5M2, OCP variant (NV; AMD HIP > 6 with OCP enabled). + FLOAT8_E5M2FNUZ, // float8 E5M2, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled). + UINT8, // 8-bit unsigned integer. + FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel). + AUTO = 255, // Sentinel: resolve to the input dtype at runtime. }; /// Word array. @@ -1137,11 +1103,11 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f16x2& v) { #if defined(MSCCLPP_DEVICE_CUDA) uint32_t in0; asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast(&v))); - // Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16). + // Clamp abs to max encodable e4m3b15 (0x3F00 = 1.75 in fp16). uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16; uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu; - alo = alo < 0x3B80u ? alo : 0x3B80u; - ahi = ahi < 0x3B80u ? ahi : 0x3B80u; + alo = alo < 0x3F00u ? alo : 0x3F00u; + ahi = ahi < 0x3F00u ? ahi : 0x3F00u; uint32_t a0 = alo | (ahi << 16); a0 = a0 * 2u + 0x00800080u; uint32_t b0 = a0 | (in0 & 0x80008000u); @@ -1152,7 +1118,7 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to(const f16x2& v) { uint32_t in0 = v.words[0]; uint32_t abs0 = in0 & 0x7fff7fffu; uint32_t a0; - asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u)); + asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u)); a0 = a0 * 2u + 0x00800080u; uint32_t b0 = a0 | (in0 & 0x80008000u); uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u)); @@ -1175,8 +1141,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f16x4& v) { asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1])); uint32_t abs0 = in0 & 0x7fff7fffu; uint32_t abs1 = in1 & 0x7fff7fffu; - uint32_t a0 = __vminu2(abs0, 0x3B803B80u); - uint32_t a1 = __vminu2(abs1, 0x3B803B80u); + uint32_t a0 = __vminu2(abs0, 0x3F003F00u); + uint32_t a1 = __vminu2(abs1, 0x3F003F00u); a0 = a0 * 2u + 0x00800080u; a1 = a1 * 2u + 0x00800080u; uint32_t b0, b1; @@ -1189,8 +1155,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to(const f16x4& v) { uint32_t in0 = v.words[0], in1 = v.words[1]; uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu; uint32_t a0, a1; - asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u)); - asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u)); + asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u)); + asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3F003F00u)); a0 = a0 * 2u + 0x00800080u; a1 = a1 * 2u + 0x00800080u; uint32_t b0 = a0 | (in0 & 0x80008000u); diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index b8649564..a94f9863 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -45,8 +45,10 @@ void register_core(nb::module_& m) { .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32) .value("bfloat16", DataType::BFLOAT16) - .value("float8_e4m3", DataType::FLOAT8_E4M3) + .value("float8_e4m3fn", DataType::FLOAT8_E4M3FN) + .value("float8_e4m3fnuz", DataType::FLOAT8_E4M3FNUZ) .value("float8_e5m2", DataType::FLOAT8_E5M2) + .value("float8_e5m2fnuz", DataType::FLOAT8_E5M2FNUZ) .value("uint8", DataType::UINT8) .value("float8_e4m3b15", DataType::FLOAT8_E4M3B15); @@ -328,4 +330,4 @@ NB_MODULE(_mscclpp, m) { // ext register_algorithm_collection_builder(m); -} \ No newline at end of file +} diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 93cd786b..0f0a28d4 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -192,12 +192,14 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType: return DataType.int32 elif dtype == torch.bfloat16: return DataType.bfloat16 - # Hardware supports either OCP format or FNUZ format for float8. - # Mapping both to the same MSCClPP data type. - elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz: + elif dtype == torch.float8_e5m2: return DataType.float8_e5m2 - elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz: - return DataType.float8_e4m3 + elif dtype == torch.float8_e5m2fnuz: + return DataType.float8_e5m2fnuz + elif dtype == torch.float8_e4m3fn: + return DataType.float8_e4m3fn + elif dtype == torch.float8_e4m3fnuz: + return DataType.float8_e4m3fnuz elif dtype == torch.uint8: return DataType.uint8 else: diff --git a/python/test/test_fp8_accum.py b/python/test/test_fp8_accum.py index 82981ce1..ba33c085 100644 --- a/python/test/test_fp8_accum.py +++ b/python/test/test_fp8_accum.py @@ -21,6 +21,13 @@ from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group # FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs. # On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed. _is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip +_gcn_arch_name = "" +if _is_hip: + _gcn_arch_name = cp.cuda.runtime.getDeviceProperties(0).get("gcnArchName", b"") + if isinstance(_gcn_arch_name, bytes): + _gcn_arch_name = _gcn_arch_name.decode() + _gcn_arch_name = _gcn_arch_name.split(":", maxsplit=1)[0] +_is_cdna4 = _gcn_arch_name.startswith("gfx95") _skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89 pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA") @@ -90,7 +97,78 @@ def float_to_e4m3fn(f32_array, chunk_size=65536): # --------------------------------------------------------------------------- -# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80) +# FP8 E4M3FNUZ helpers (AMD/ROCm; bias=8, max=240, NaN = bits==0x80, no -0) +# --------------------------------------------------------------------------- + + +def e4m3fnuz_to_float(uint8_array): + """Decode a cupy uint8 array of E4M3FNUZ bit patterns to float32.""" + bits = uint8_array.astype(cp.int32) + sign = (bits >> 7) & 1 + exp = (bits >> 3) & 0xF + mant = bits & 0x7 + + # Normal: (-1)^s * 2^(exp-8) * (1 + mant/8) + normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 8).astype(cp.int32)) + # Subnormal (exp==0): (-1)^s * 2^(-7) * (mant/8) + subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-7)) + + result = cp.where(exp == 0, subnormal_val, normal_val) + result = cp.where(sign == 1, -result, result) + # Zero is only 0x00; the 0x80 encoding is reserved for NaN under fnuz. + result = cp.where(uint8_array.astype(cp.int32) == 0, cp.float32(0.0), result) + nan_mask = uint8_array.astype(cp.int32) == 0x80 + result = cp.where(nan_mask, cp.float32(float("nan")), result) + return result + + +def float_to_e4m3fnuz(f32_array, chunk_size=65536): + """Encode a cupy float32 array to uint8 E4M3FNUZ bit patterns. + + Same lookup-table approach as float_to_e4m3fn but using the fnuz table. + """ + all_bytes = cp.arange(128, dtype=cp.uint8) + all_floats = e4m3fnuz_to_float(all_bytes) + all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) + + clamped = f32_array.astype(cp.float32) + clamped = cp.clip(clamped, -240.0, 240.0) + signs = (clamped < 0).astype(cp.uint8) + absval = cp.abs(clamped) + + result = cp.zeros(absval.shape, dtype=cp.uint8) + n = absval.size + absval_flat = absval.ravel() + result_flat = result.ravel() + + for start in range(0, n, chunk_size): + end = min(start + chunk_size, n) + chunk = absval_flat[start:end] + diffs = cp.abs(chunk[:, None] - all_floats[None, :]) + result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8) + + result = result_flat.reshape(absval.shape) + result = result | (signs << 7) + # 0x80 is NaN under fnuz (no negative zero). Collapse any encoding that + # landed on 0x80 (small negatives quantised to zero magnitude) to 0x00. + result = cp.where(result == 0x80, cp.uint8(0), result) + return result + + +# Platform-aware E4M3 native helpers: ROCm CDNA4 and CUDA use OCP fn; older ROCm uses fnuz. +if _is_hip and not _is_cdna4: + e4m3_native_to_float = e4m3fnuz_to_float + float_to_e4m3_native = float_to_e4m3fnuz + fp8_native_dtype = DataType.float8_e4m3fnuz +else: + e4m3_native_to_float = e4m3fn_to_float + float_to_e4m3_native = float_to_e4m3fn + fp8_native_dtype = DataType.float8_e4m3fn + + +# --------------------------------------------------------------------------- +# FP8 E4M3B15 helpers (bias=15, encode saturates to ±1.75, no NaN) +# Matches Triton's fp8e4b15: all 256 bit patterns are finite. # --------------------------------------------------------------------------- @@ -108,11 +186,6 @@ def e4m3b15_to_float(uint8_array): result = cp.where(exp == 0, subnormal_val, normal_val) result = cp.where(sign == 1, -result, result) - # Zero - result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result) - # NaN: exp==15 or negative zero (0x80) - nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80) - result = cp.where(nan_mask, cp.float32(float("nan")), result) return result @@ -120,18 +193,17 @@ def float_to_e4m3b15(f32_array, chunk_size=65536): """Encode a cupy float32 array to uint8 E4M3B15 bit patterns. Same lookup-table approach as float_to_e4m3fn. + Saturates to ±1.75 (0x7e/0xfe), matching Triton's fp8e4b15. """ # Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F) all_bytes = cp.arange(128, dtype=cp.uint8) all_floats = e4m3b15_to_float(all_bytes) # (128,) float32 - # Mark NaN entries as inf so they're never selected as nearest - all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats) - # Clamp input and extract sign - clamped = f32_array.astype(cp.float32) - clamped = cp.clip(clamped, -0.9375, 0.9375) - signs = (clamped < 0).astype(cp.uint8) - absval = cp.abs(clamped) + # Clamp input and extract sign. + values = f32_array.astype(cp.float32) + signs = cp.signbit(values).astype(cp.uint8) + absval = cp.abs(values) + absval = cp.clip(absval, cp.float32(0.0), cp.float32(1.75)) result = cp.zeros(absval.shape, dtype=cp.uint8) n = absval.size @@ -148,8 +220,6 @@ def float_to_e4m3b15(f32_array, chunk_size=65536): # Combine with sign bit result = result_flat.reshape(absval.shape) result = result | (signs << 7) - # Handle exact zero - result = cp.where(absval == 0, cp.uint8(0), result) return result @@ -226,12 +296,6 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): buf = GpuBuffer(size, dtype=cp.uint8) - accum_configs = [ - ("fp8_native", DataType.float8_e4m3), - ("float16", DataType.float16), - ("float32", DataType.float32), - ] - # rsag_zero_copy and fullmesh need explicit block/thread counts if "rsag" in algo_name: nb = max(1, min(32, size // (world_size * 32))) @@ -243,13 +307,19 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): nb = 0 nt = 0 + accum_configs = [ + ("fp8_native", fp8_native_dtype), + ("float16", DataType.float16), + ("float32", DataType.float32), + ] + errors = {} for accum_label, accum_dtype in accum_configs: # Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm) rng = np.random.RandomState(42 + rank) src_f32 = cp.asarray(rng.randn(size).astype(np.float32)) src_f32 = cp.clip(src_f32, -240.0, 240.0) - src_fp8 = float_to_e4m3fn(src_f32) + src_fp8 = float_to_e4m3_native(src_f32) # Copy into symmetric buffer buf[:] = src_fp8 @@ -260,12 +330,12 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): algo, comm_group, buf, - dtype=DataType.float8_e4m3, + dtype=fp8_native_dtype, accum_dtype=accum_dtype, nblocks=nb, nthreads_per_block=nt, ) - result_f32 = e4m3fn_to_float(result) + result_f32 = e4m3_native_to_float(result) # Compute float32 reference: sum all ranks' quantized FP8 inputs in float32 ref_f32 = cp.zeros(size, dtype=cp.float32) @@ -273,12 +343,13 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int): rng_r = np.random.RandomState(42 + r) rank_data = cp.asarray(rng_r.randn(size).astype(np.float32)) rank_data = cp.clip(rank_data, -240.0, 240.0) - rank_data_fp8 = float_to_e4m3fn(rank_data) - ref_f32 += e4m3fn_to_float(rank_data_fp8) + rank_data_fp8 = float_to_e4m3_native(rank_data) + ref_f32 += e4m3_native_to_float(rank_data_fp8) - # Compute errors - abs_err = cp.abs(result_f32 - ref_f32) - mean_abs_err = float(cp.mean(abs_err)) + # Compute errors (only on valid, non-NaN entries) + valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) + abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) + mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 errors[accum_label] = mean_abs_err # Reset between runs @@ -341,13 +412,10 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): errors = {} for accum_label, accum_dtype in accum_configs: - # Generate deterministic per-rank random uint8 values in valid e4m3b15 range + # Generate deterministic per-rank random uint8 values covering the full e4m3b15 range. + # All 256 bit patterns are valid (no NaN in this format). rng = np.random.RandomState(42 + rank) - raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8)) - signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7 - src_uint8 = raw | signs - # Fix negative zero -> positive zero - src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8) + src_uint8 = cp.asarray(rng.randint(0, 256, (size,)).astype(np.uint8)) # Copy into symmetric buffer buf[:] = src_uint8 @@ -371,19 +439,15 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int): ref_f32 = cp.zeros(size, dtype=cp.float32) for r in range(world_size): rng_r = np.random.RandomState(42 + r) - raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8)) - signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7 - bits_r = raw_r | signs_r - bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r) + bits_r = cp.asarray(rng_r.randint(0, 256, (size,)).astype(np.uint8)) ref_f32 += e4m3b15_to_float(bits_r) # Clamp reference to e4m3b15 representable range - ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375) + ref_f32 = cp.clip(ref_f32, -1.75, 1.75) - # Compute errors (only on valid entries) - valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32) - abs_err = cp.abs(result_f32[valid] - ref_f32[valid]) - mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0 + # Compute errors + abs_err = cp.abs(result_f32 - ref_f32) + mean_abs_err = float(cp.mean(abs_err)) errors[accum_label] = mean_abs_err algo.reset() diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index ffa53aa8..c0713daa 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -3,6 +3,7 @@ #include #include +#include #include #include "logger.hpp" @@ -182,13 +183,41 @@ CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* stream); break; #if defined(__FP8_TYPES_EXIST__) - case DataType::FLOAT8_E4M3: - executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3, + case DataType::FLOAT8_E4M3FN: +#if defined(__FP8_E4M3_IS_FNUZ__) + THROW(EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ"); +#else + executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FN, plan_, stream); +#endif + break; + case DataType::FLOAT8_E4M3FNUZ: +#if defined(__FP8_E4M3_IS_FNUZ__) + executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FNUZ, + plan_, stream); +#else + THROW(EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN"); +#endif break; case DataType::FLOAT8_E5M2: +#if defined(__FP8_E5M2_IS_FNUZ__) + THROW(EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ"); +#else executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2, plan_, stream); +#endif + break; + case DataType::FLOAT8_E5M2FNUZ: +#if defined(__FP8_E5M2_IS_FNUZ__) + executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2FNUZ, + plan_, stream); +#else + THROW(EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2"); +#endif break; #endif case DataType::FLOAT8_E4M3B15: @@ -230,4 +259,4 @@ std::pair, size_t> getFlagBuffer() { return {ptr, gDefaultFlagCount * sizeof(uint32_t)}; } -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/core/executor/execution_kernel.cu b/src/core/executor/execution_kernel.cu index 28ced77f..d639efb7 100644 --- a/src/core/executor/execution_kernel.cu +++ b/src/core/executor/execution_kernel.cu @@ -78,8 +78,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo ); #endif break; - case DataType::FLOAT8_E4M3: + case DataType::FLOAT8_E4M3FN: + case DataType::FLOAT8_E4M3FNUZ: case DataType::FLOAT8_E5M2: + case DataType::FLOAT8_E5M2FNUZ: // FP8 is not supported in CUDA execution kernel. break; case DataType::FLOAT8_E4M3B15: diff --git a/src/core/executor/executor.cc b/src/core/executor/executor.cc index bf2caf97..fcecc4dd 100644 --- a/src/core/executor/executor.cc +++ b/src/core/executor/executor.cc @@ -7,7 +7,6 @@ #include #include -#include "debug.h" #include "execution_kernel.hpp" #include "execution_plan.hpp" @@ -509,8 +508,7 @@ Executor::Executor(std::shared_ptr comm, std::shared_ptr def void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuffSize, [[maybe_unused]] size_t recvBuffSize, DataType dataType, const ExecutionPlan& plan, cudaStream_t stream, PacketType packetType) { - INFO(MSCCLPP_EXECUTOR, "Starting execution with plan: %s, collective: %s", plan.name().c_str(), - plan.collective().c_str()); + INFO(LogSubsys::EXEC, "Starting execution with plan: ", plan.name(), ", collective: ", plan.collective()); size_t sendMemRange, recvMemRange; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendMemRange, (CUdeviceptr)sendbuff)); diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index 87b88888..cb808bc8 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -17,6 +17,7 @@ #include #include "execution_common.hpp" +#include "logger.hpp" #include "reduce_kernel.hpp" namespace mscclpp { @@ -876,7 +877,19 @@ class ExecutionKernel { #endif break; #if defined(__FP8_TYPES_EXIST__) - case DataType::FLOAT8_E4M3: + case DataType::FLOAT8_E4M3FN: + case DataType::FLOAT8_E4M3FNUZ: +#if defined(__FP8_E4M3_IS_FNUZ__) + if (dataType == DataType::FLOAT8_E4M3FN) { + THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ"); + } +#else + if (dataType == DataType::FLOAT8_E4M3FNUZ) { + THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN"); + } +#endif executionKernel<__fp8_e4m3, PacketType, ReuseScratch><<>>( rank, (__fp8_e4m3*)src, (__fp8_e4m3*)dst, (__fp8_e4m3*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag @@ -888,6 +901,18 @@ class ExecutionKernel { #endif break; case DataType::FLOAT8_E5M2: + case DataType::FLOAT8_E5M2FNUZ: +#if defined(__FP8_E5M2_IS_FNUZ__) + if (dataType == DataType::FLOAT8_E5M2) { + THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ"); + } +#else + if (dataType == DataType::FLOAT8_E5M2FNUZ) { + THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage, + "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2"); + } +#endif executionKernel<__fp8_e5m2, PacketType, ReuseScratch><<>>( rank, (__fp8_e5m2*)src, (__fp8_e5m2*)dst, (__fp8_e5m2*)scratch, scratchOffset, scratchChunkSize, plan, semaphores, localMemoryIdBegin, flag diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index e2d8ef73..6199f192 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -200,7 +200,8 @@ inline std::pair getDefaultBlockNumAndThreadNum(size_t inputSize, int { bool isFp8 = dtype == DataType::FLOAT8_E4M3B15; #if defined(__FP8_TYPES_EXIST__) - isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2; + isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || + dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ; #endif if (isFp8) { if (inputSize < (64 << 10)) { @@ -310,4 +311,4 @@ std::shared_ptr AllreducePacket::build() { } } // namespace collective -} // namespace mscclpp \ No newline at end of file +} // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 1e0e7e69..93b18e26 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -101,10 +101,20 @@ AllreduceFunc dispatchByDtype(mscclpp::DataType dtype, mscclpp::DataType accumDt return Adapter::call; #endif #if defined(__FP8_TYPES_EXIST__) - } else if (dtype == mscclpp::DataType::FLOAT8_E4M3) { +#if defined(__FP8_E4M3_IS_FNUZ__) + } else if (dtype == mscclpp::DataType::FLOAT8_E4M3FNUZ) { return dispatchFp8Accum(accumDtype, dtype); +#else + } else if (dtype == mscclpp::DataType::FLOAT8_E4M3FN) { + return dispatchFp8Accum(accumDtype, dtype); +#endif +#if defined(__FP8_E5M2_IS_FNUZ__) + } else if (dtype == mscclpp::DataType::FLOAT8_E5M2FNUZ) { + return dispatchFp8Accum(accumDtype, dtype); +#else } else if (dtype == mscclpp::DataType::FLOAT8_E5M2) { return dispatchFp8Accum(accumDtype, dtype); +#endif #endif } else if (dtype == mscclpp::DataType::FLOAT8_E4M3B15) { return dispatchFp8Accum(accumDtype, dtype); @@ -125,4 +135,4 @@ AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype, mscclpp::DataType a } // namespace collective } // namespace mscclpp -#endif // MSCCLPP_ALLREDUCE_COMMON_HPP_ \ No newline at end of file +#endif // MSCCLPP_ALLREDUCE_COMMON_HPP_ diff --git a/src/ext/nccl/algorithm_selector.cc b/src/ext/nccl/algorithm_selector.cc index 0b9592d7..c94aab34 100644 --- a/src/ext/nccl/algorithm_selector.cc +++ b/src/ext/nccl/algorithm_selector.cc @@ -20,7 +20,8 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da return false; } - const bool isFp8 = dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2; + const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ || + dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ; if (!isFp8) { return nvlsSupported; diff --git a/src/ext/nccl/datatype_conversion.hpp b/src/ext/nccl/datatype_conversion.hpp index dcfb645a..a5c74def 100644 --- a/src/ext/nccl/datatype_conversion.hpp +++ b/src/ext/nccl/datatype_conversion.hpp @@ -28,12 +28,21 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) { return mscclpp::DataType::BFLOAT16; #ifdef __FP8_TYPES_EXIST__ case ncclFloat8e4m3: - return mscclpp::DataType::FLOAT8_E4M3; +#if defined(__FP8_E4M3_IS_FNUZ__) + return mscclpp::DataType::FLOAT8_E4M3FNUZ; +#else + return mscclpp::DataType::FLOAT8_E4M3FN; +#endif case ncclFloat8e5m2: +#if defined(__FP8_E5M2_IS_FNUZ__) + return mscclpp::DataType::FLOAT8_E5M2FNUZ; +#else return mscclpp::DataType::FLOAT8_E5M2; +#endif #endif default: - throw mscclpp::Error("Unsupported ncclDataType_t: " + std::to_string(dtype), mscclpp::ErrorCode::InvalidUsage); + THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "Unsupported ncclDataType_t: " + std::to_string(dtype)); } } @@ -41,8 +50,10 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) { inline size_t getDataTypeSize(mscclpp::DataType dtype) { switch (dtype) { case mscclpp::DataType::UINT8: - case mscclpp::DataType::FLOAT8_E4M3: + case mscclpp::DataType::FLOAT8_E4M3FN: + case mscclpp::DataType::FLOAT8_E4M3FNUZ: case mscclpp::DataType::FLOAT8_E5M2: + case mscclpp::DataType::FLOAT8_E5M2FNUZ: case mscclpp::DataType::FLOAT8_E4M3B15: return 1; case mscclpp::DataType::FLOAT16: @@ -72,10 +83,32 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) { case mscclpp::DataType::BFLOAT16: return ncclBfloat16; #ifdef __FP8_TYPES_EXIST__ - case mscclpp::DataType::FLOAT8_E4M3: +#if defined(__FP8_E4M3_IS_FNUZ__) + case mscclpp::DataType::FLOAT8_E4M3FNUZ: return ncclFloat8e4m3; + case mscclpp::DataType::FLOAT8_E4M3FN: + THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ for NCCL collectives"); +#else + case mscclpp::DataType::FLOAT8_E4M3FN: + return ncclFloat8e4m3; + case mscclpp::DataType::FLOAT8_E4M3FNUZ: + THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN for NCCL collectives"); +#endif +#if defined(__FP8_E5M2_IS_FNUZ__) + case mscclpp::DataType::FLOAT8_E5M2FNUZ: + return ncclFloat8e5m2; + case mscclpp::DataType::FLOAT8_E5M2: + THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ for NCCL collectives"); +#else case mscclpp::DataType::FLOAT8_E5M2: return ncclFloat8e5m2; + case mscclpp::DataType::FLOAT8_E5M2FNUZ: + THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage, + "FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2 for NCCL collectives"); +#endif #endif case mscclpp::DataType::FLOAT8_E4M3B15: // float8_e4m3b15 has no NCCL equivalent; NCCL cannot reduce this type correctly. @@ -98,4 +131,4 @@ inline mscclpp::ReduceOp ncclRedOpToMscclpp(ncclRedOp_t op) { } } -#endif // MSCCLPP_DATATYPE_CONVERSION_HPP_ \ No newline at end of file +#endif // MSCCLPP_DATATYPE_CONVERSION_HPP_ From 9ec26fa4d11325ca33dd4dca83b99dee9146e6bf Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 4 May 2026 15:11:47 -0700 Subject: [PATCH 04/12] Reset GPU tokens before reuse (#795) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a token-reuse bug in `TokenPool` that's independent of MNNVL. ## Bug `TokenPool` hands out 8-byte device-memory slots used as device-semaphore counters. The deleter only cleared the bitmap — the underlying GPU memory was left as-is. When a token was freed and later re-allocated, the new semaphore inherited the previous counter value instead of starting at 0, breaking subsequent `signal()/wait()` math. ## Fix * Add a synchronous `gpuMemset` host helper (mirrors `gpuMemcpy` / `gpuMemcpyAsync`). * Zero the slot inside the `TokenPool` deleter so recycled tokens hand out a clean counter. The very-first allocation is already zeroed by `gpuCallocPhysical` (`src/core/gpu_utils.cc:227-228`), so first-time tokens are also clean — the deleter only has to handle the recycle case. ## Notes * Public wrapper is named `mscclpp::gpuMemset` (not `mscclpp::memset`) for symmetry with `gpuMemcpy` and to avoid shadowing `std::memset` in TUs that pull the namespace in. * Zeroing happens on release rather than acquire so the cost is paid in the typically less perf-sensitive teardown path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/gpu_utils.hpp | 7 +++++++ src/core/gpu_utils.cc | 7 +++++++ src/core/utils_internal.cc | 3 +++ 3 files changed, 17 insertions(+) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index ecd13c47..b079e0fd 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr); void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault); void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault); +void gpuMemset(void* ptr, int value, size_t bytes); /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is /// destroyed. @@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind); } +/// Sets `bytes` of memory at `ptr` to `value` synchronously. +/// @param ptr Destination address. +/// @param value Value to set (interpreted as unsigned char per CUDA semantics). +/// @param bytes Number of bytes to set. +inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); } + /// Check if NVLink SHARP (NVLS) is supported. /// /// @return True if NVLink SHARP (NVLS) is supported, false otherwise. diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 09d5025d..1ce61322 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); } +void gpuMemset(void* ptr, int value, size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +} + } // namespace detail bool isNvlsSupported() { diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 9504a52c..8cc55430 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) { std::shared_ptr TokenPool::getToken() { auto deleter = [self = shared_from_this()](uint64_t* token) { + // Zero the slot on release so the next allocator hands out a clean + // semaphore counter (matches a freshly-allocated slot). + mscclpp::gpuMemset(token, 0, sizeof(uint64_t)); size_t index = (token - self->baseAddr_) / UINT64_WIDTH; size_t bit = (token - self->baseAddr_) % UINT64_WIDTH; uint64_t mask = 1UL << bit; From 822fbb235168d3bf85112bdf53a55cbfc86e4737 Mon Sep 17 00:00:00 2001 From: Mahdieh Ghazi Date: Tue, 5 May 2026 17:17:41 -0400 Subject: [PATCH 05/12] Adding necessary macros for enabling mrc support (#797) This PR adds necessary macros and instructions for enabling mrc support with no atomic. --- CMakeLists.txt | 1 + docs/quickstart.md | 39 +++++++++++++++++++++++++++++++++++++ src/core/CMakeLists.txt | 4 ++++ src/core/ibverbs_wrapper.cc | 36 ++++++++++++++++++++++++++++++++++ 4 files changed, 80 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ef8b785a..49154e0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON) option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF) option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF) option(MSCCLPP_USE_IB "Use InfiniBand." ON) +option(MSCCLPP_USE_MRC "Enable MRC support" OFF) option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF) option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF) diff --git a/docs/quickstart.md b/docs/quickstart.md index 83a08d6a..716fcf61 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]" $ python -m pip install ".[cuda12,benchmark,test]" ``` +(mrc-support)= +## MRC Support + +MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience. + +To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below. + +--- + +### 1. Install MRC Verbs Shim + +MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation. + +- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster. +- Ensure that the underlying system has MRC support enabled. + +--- + +### 2. Build MSCCL++ with MRC Enabled + +Enable MRC support during the build by adding the following CMake option: + +```bash +-DMSCCLPP_USE_MRC=ON +``` + +This configures MSCCL++ to use the MRC-enabled verbs layer at runtime. + +### 3. Configure Runtime Environment + +At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack: + +```bash +-x MSCCLPP_IBV_SO=:$MRC-SHIM-HOME/libibverbs.so +-x LD_LIBRARY_PATH=$MRC-SHIM-HOME/mrc-header-lib:$LD_LIBRARY_PATH +-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so" +-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1 +``` + (vscode-dev-container)= ## VSCode Dev Container diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 9ca5fed3..5b89eedc 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS) target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS}) endif() +if(MSCCLPP_USE_MRC) + target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC) +endif() + # libmscclpp add_library(mscclpp SHARED) target_link_libraries(mscclpp PUBLIC mscclpp_obj) diff --git a/src/core/ibverbs_wrapper.cc b/src/core/ibverbs_wrapper.cc index 51f3f29c..60ee0694 100644 --- a/src/core/ibverbs_wrapper.cc +++ b/src/core/ibverbs_wrapper.cc @@ -10,11 +10,28 @@ #include "logger.hpp" +// Adding MSCCLPP_USE_MRC micro for MRC enablement. +// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO +// environment variable is not set. +#if (MSCCLPP_USE_MRC) +#include +#include +#endif // (MSCCLPP_USE_MRC) + namespace mscclpp { static std::unique_ptr globalIBVerbsHandle(nullptr, &::dlclose); +#if (MSCCLPP_USE_MRC) +static std::unique_ptr globalOrigIBVerbsHandle(nullptr, &::dlclose); +#endif // (MSCCLPP_USE_MRC) void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) { +#if (MSCCLPP_USE_MRC) + static std::set mrcSymbols = { + "ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp", + "ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp", + }; +#endif // (MSCCLPP_USE_MRC) if (!globalIBVerbsHandle) { if (mscclpp::env()->ibvSo != "") { void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW); @@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) { THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror())); } } +#if (MSCCLPP_USE_MRC) + // In MRC mode, `VMRC_LIBIBVERBS_SO` should be set. + char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO"); + void* ptr; + if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) { + // If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs. + if (!globalOrigIBVerbsHandle) { + void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW); + if (!handle) { + THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo)); + } + globalOrigIBVerbsHandle.reset(handle); + } + ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str()); + } else { + ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str()); + } +#else // !(MSCCLPP_USE_MRC) void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str()); +#endif // !(MSCCLPP_USE_MRC) if (!ptr && !allowReturnNull) { THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol); } From 0c9b9abfd512c51e5df4d9b27f39627c3693013b Mon Sep 17 00:00:00 2001 From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com> Date: Tue, 12 May 2026 13:45:55 -0700 Subject: [PATCH 06/12] Adding Support 4 Nodes AllReduce Small Message Size (#794) Results on 4 Nodes H200: | Size | NCCL | MSCCL++ 57TB | MSCCL++ 29TB | |------|-------|--------------|--------------| | 8K | 45.75 | 17.74 | 18.18 | | 16K | 47.08 | 18.9 | 18.42 | | 32K | 47.29 | 19.48 | 19.12 | | 64K | 50.34 | 20.51 | 19.29 | | 128K | 59.65 | 21.37 | 20.25 | | 256K | 87.46 | 23.87 | 23.51 | | 512K | 106.55| 29.15 | 29.51 | | 1M | 115 | 40.64 | 41.83 | | 2M | 135.89| 63.73 | 70.45 | | 4M | 177.59| 121.76 | 128.79 | | 8M | 251.17| 228.5 | 251.36 | --------- Co-authored-by: Binyang Li Co-authored-by: Caio Rocha --- python/mscclpp/__main__.py | 46 ++++++- python/mscclpp/default_algos/__init__.py | 4 +- ...uce_2nodes.py => allreduce_multi_nodes.py} | 124 +++++++++++++----- .../algorithm_collection_builder.cc | 4 +- 4 files changed, 143 insertions(+), 35 deletions(-) rename python/mscclpp/default_algos/{allreduce_2nodes.py => allreduce_multi_nodes.py} (61%) diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index 6a6f5f28..450ec748 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -13,7 +13,7 @@ from mscclpp.language.utils import AlgoSpec default_algo_configs = [ { "filename": "allreduce_2nodes_1K_64K.json", - "function": def_algo.allreduce_2nodes, + "function": def_algo.allreduce_multi_nodes, "spec": AlgoSpec( name="allreduce_2nodes_1K_64K", collective=AllReduce(16, 1, True), @@ -34,7 +34,7 @@ default_algo_configs = [ }, { "filename": "allreduce_2nodes_128K_2M.json", - "function": def_algo.allreduce_2nodes, + "function": def_algo.allreduce_multi_nodes, "spec": AlgoSpec( name="allreduce_2nodes_128K_2M", collective=AllReduce(16, 1, True), @@ -53,6 +53,48 @@ default_algo_configs = [ ), "additional_kwargs": {"thread_block_group_size": 4}, }, + { + "filename": "allreduce_4nodes_1K_64K.json", + "function": def_algo.allreduce_multi_nodes, + "spec": AlgoSpec( + name="allreduce_4nodes_1K_64K", + collective=AllReduce(32, 1, True), + nranks_per_node=8, + world_size=32, + in_place=True, + instances=1, + protocol="LL", + auto_sync=False, + num_threads_per_block=1024, + reuse_resources=True, + use_double_scratch_buffer=True, + min_message_size=1 << 10, + max_message_size=64 << 10, + tags={"default": 1}, + ), + "additional_kwargs": {"thread_block_group_size": 1}, + }, + { + "filename": "allreduce_4nodes_128K_2M.json", + "function": def_algo.allreduce_multi_nodes, + "spec": AlgoSpec( + name="allreduce_4nodes_128K_2M", + collective=AllReduce(32, 1, True), + nranks_per_node=8, + world_size=32, + in_place=True, + instances=1, + protocol="LL", + auto_sync=False, + num_threads_per_block=1024, + reuse_resources=True, + use_double_scratch_buffer=True, + min_message_size=128 << 10, + max_message_size=2 << 20, + tags={"default": 1}, + ), + "additional_kwargs": {"thread_block_group_size": 4}, + }, ] diff --git a/python/mscclpp/default_algos/__init__.py b/python/mscclpp/default_algos/__init__.py index a5cfa882..1767aab6 100644 --- a/python/mscclpp/default_algos/__init__.py +++ b/python/mscclpp/default_algos/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from mscclpp.default_algos.allreduce_2nodes import allreduce_2nodes +from mscclpp.default_algos.allreduce_multi_nodes import allreduce_multi_nodes -__all__ = ["allreduce_2nodes"] +__all__ = ["allreduce_multi_nodes"] diff --git a/python/mscclpp/default_algos/allreduce_2nodes.py b/python/mscclpp/default_algos/allreduce_multi_nodes.py similarity index 61% rename from python/mscclpp/default_algos/allreduce_2nodes.py rename to python/mscclpp/default_algos/allreduce_multi_nodes.py index 5a355887..5697a0e3 100644 --- a/python/mscclpp/default_algos/allreduce_2nodes.py +++ b/python/mscclpp/default_algos/allreduce_multi_nodes.py @@ -2,9 +2,11 @@ # Licensed under the MIT License. """ -Multi-node AllReduce implementation using packet-based communication. -This implements a hierarchical AllReduce: intra-node allreduce followed by -inter-node exchange and final intra-node allreduce. +Generalized multi-node AllReduce implementation using packet-based communication. +This implements a hierarchical AllReduce for N nodes: +1. Intra-node reduce-scatter (each GPU reduces its assigned chunk across the node) +2. Inter-node allreduce (exchange fully intra-reduced chunks across all nodes) +3. Intra-node broadcast (distribute the fully reduced chunks back to all GPUs in the node) """ from mscclpp.language.utils import AlgoSpec @@ -15,7 +17,7 @@ from mscclpp.language.program import * from mscclpp.language.collectives import * -def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram: +def allreduce_multi_nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram: """ Implements a multi-node AllReduce using a hierarchical approach: 1. Intra-node allreduce @@ -23,10 +25,10 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective 3. Intra-node allreduce """ # Configuration constants - num_nodes = 2 + num_nodes = spec.world_size // spec.nranks_per_node gpus_per_node = spec.nranks_per_node total_gpus = num_nodes * gpus_per_node - packets_per_gpu = 2 + packets_per_gpu = num_nodes with CollectiveProgram.from_spec(spec) as prog: # Initialize communication channels and buffers @@ -54,11 +56,21 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective ) ) - scratch_buffer_size = packets_per_gpu * (total_gpus + 1) + # Scratch buffer layout (3 contiguous regions): + # Region 1 [0, total_gpus): + # Intra-node reduce-scatter. Each GPU receives chunks from gpus_per_node peers, + # packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots. + # Region 2 [total_gpus, total_gpus + num_nodes * packets_per_gpu): + # Inter-node exchange. Each GPU receives reduced chunks from num_nodes nodes, + # packets_per_gpu each → num_nodes * packets_per_gpu slots. + # Region 3 [total_gpus + num_nodes * packets_per_gpu, end): + # Intra-node broadcast. Each GPU receives final reduced data from gpus_per_node peers, + # packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots. + # Total = 2 * total_gpus + num_nodes * packets_per_gpu + scratch_buffer_size = 2 * total_gpus + packets_per_gpu * num_nodes for node_id in range(num_nodes): for local_gpu_id in range(gpus_per_node): current_rank_id = local_gpu_id + gpus_per_node * node_id - next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus scratch_buffers.append(Buffer(current_rank_id, scratch_buffer_size)) for peer_gpu_id in range(gpus_per_node): if peer_gpu_id != local_gpu_id: @@ -66,7 +78,12 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective intra_node_memory_channels[(peer_rank_id, current_rank_id)] = MemoryChannel( peer_rank_id, current_rank_id ) - inter_node_port_channels[current_rank_id] = PortChannel(next_node_rank_id, current_rank_id) + for peer_node_id in range(num_nodes): + if peer_node_id != node_id: + peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus + inter_node_port_channels[(current_rank_id, peer_node_rank_id)] = PortChannel( + peer_node_rank_id, current_rank_id + ) # AllReduce for node_id in range(num_nodes): @@ -74,7 +91,6 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective current_rank_id = local_gpu_id + gpus_per_node * node_id current_rank = Rank(current_rank_id) input_buffer = current_rank.get_input_buffer() - next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus # Intra Node Exchange Data for peer_gpu_id in range(gpus_per_node): @@ -118,27 +134,32 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective ) inter_node_offset = total_gpus - inter_node_port_channels[current_rank_id].put_packets( - scratch_buffers[next_node_rank_id][ - inter_node_offset - + local_gpu_id * packets_per_gpu : inter_node_offset - + local_gpu_id * packets_per_gpu - + packets_per_gpu - ], - scratch_buffers[current_rank_id][ - local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu - ], - tb=0, - ) + for peer_node_id in range(num_nodes): + if peer_node_id != node_id: + peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus + inter_node_port_channels[(current_rank_id, peer_node_rank_id)].put_packets( + scratch_buffers[peer_node_rank_id][ + inter_node_offset + + node_id * packets_per_gpu : inter_node_offset + + node_id * packets_per_gpu + + packets_per_gpu + ], + scratch_buffers[current_rank_id][ + local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu + ], + tb=0, + ) # Reduce Received Data from Remote Node inter_node_data = [ scratch_buffers[current_rank_id][ inter_node_offset - + local_gpu_id * packets_per_gpu : inter_node_offset - + local_gpu_id * packets_per_gpu + + peer_node_id * packets_per_gpu : inter_node_offset + + peer_node_id * packets_per_gpu + packets_per_gpu ] + for peer_node_id in range(num_nodes) + if peer_node_id != node_id ] current_rank.reduce( input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu], @@ -148,12 +169,18 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective ) current_rank.copy_packets( - scratch_buffers[current_rank_id][scratch_buffer_size - packets_per_gpu : scratch_buffer_size], + scratch_buffers[current_rank_id][ + inter_node_offset + + node_id * packets_per_gpu : inter_node_offset + + node_id * packets_per_gpu + + packets_per_gpu + ], input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu], tb_group=global_intra_node_tbg, ) # Broadcast Reduced Data + broadcast_offset = total_gpus + packets_per_gpu * num_nodes for peer_gpu_id in range(gpus_per_node): peer_rank_id = peer_gpu_id + gpus_per_node * node_id @@ -161,13 +188,16 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1 intra_node_memory_channels[(peer_rank_id, current_rank_id)].read_put_packets( scratch_buffers[peer_rank_id][ - inter_node_offset - + local_gpu_id * packets_per_gpu : inter_node_offset + broadcast_offset + + local_gpu_id * packets_per_gpu : broadcast_offset + local_gpu_id * packets_per_gpu + packets_per_gpu ], scratch_buffers[current_rank_id][ - scratch_buffer_size - packets_per_gpu : scratch_buffer_size + inter_node_offset + + node_id * packets_per_gpu : inter_node_offset + + node_id * packets_per_gpu + + packets_per_gpu ], tb_group=thread_block_groups[tbg_id], ) @@ -181,8 +211,8 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective peer_gpu_id * packets_per_gpu : peer_gpu_id * packets_per_gpu + packets_per_gpu ], scratch_buffers[current_rank_id][ - inter_node_offset - + peer_gpu_id * packets_per_gpu : inter_node_offset + broadcast_offset + + peer_gpu_id * packets_per_gpu : broadcast_offset + peer_gpu_id * packets_per_gpu + packets_per_gpu ], @@ -190,3 +220,37 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective ) return prog + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--name", type=str, help="name of the program") + parser.add_argument("--num_gpus", type=int, help="total number of gpus") + parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node") + parser.add_argument("--tbg", type=int, default=1, help="thread block group size") + parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block") + parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size") + parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size") + + args = parser.parse_args() + + spec = AlgoSpec( + name=args.name, + collective=AllReduce(args.num_gpus, 1, True), + nranks_per_node=args.gpus_per_node, + world_size=args.num_gpus, + in_place=True, + instances=1, + protocol="LL", + auto_sync=False, + num_threads_per_block=args.num_threads_per_block, + reuse_resources=True, + use_double_scratch_buffer=True, + min_message_size=args.min_message_size, + max_message_size=args.max_message_size, + ) + + prog = allreduce_multi_nodes(spec, args.tbg) + print(prog.to_json()) diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 2a7e6e91..5d196d12 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -113,7 +113,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra }; static const std::vector defaultAlgoConfigs = { {"allreduce_2nodes_1K_64K.json", "allreduce", 8, 16, {{"default", 1}}}, - {"allreduce_2nodes_64K_2M.json", "allreduce", 8, 16, {{"default", 1}}}}; + {"allreduce_2nodes_128K_2M.json", "allreduce", 8, 16, {{"default", 1}}}, + {"allreduce_4nodes_1K_64K.json", "allreduce", 8, 32, {{"default", 1}}}, + {"allreduce_4nodes_128K_2M.json", "allreduce", 4, 64, {{"default", 1}}}}; AlgorithmCollection collection; static auto generateFileId = [](const std::string& input) { From 40295df4c4d0b57a028f29320996f50587330656 Mon Sep 17 00:00:00 2001 From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com> Date: Thu, 14 May 2026 09:56:11 -0700 Subject: [PATCH 07/12] Adding Support to bf16 Executor Tests (#801) This pull request adds support for the `bfloat16` (bf16) data type to the test executor, including both Python and CUDA components. The changes ensure that `bfloat16` is handled consistently across argument parsing, data type conversion, and test kernel implementations. Additionally, the CUDA verification kernels are refactored to use parameterized tolerances for improved numerical accuracy checks. **Support for bfloat16 data type:** * Added handling for `bfloat16`/`bf16` in the Python test executor's argument parsing, data type conversion (`parse_dtype`, `dtype_to_mscclpp_dtype`), and help text. [[1]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcR27-R28) [[2]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL122-R135) [[3]](diffhunk://#diff-e643968a8622d1603868a8ecf4b2fcd8108be1e404a3420bb7e2a6d51dc23fdcL246-R251) * Updated output to display the correct data type string for `bfloat16`. **CUDA kernel and test improvements:** * Included `bfloat16` headers and defined test data fill and gather kernels for `bfloat16` on both CUDA and HIP platforms. [[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R8-R11) [[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R35) [[3]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R54-R59) [[4]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88R133) * Refactored verification kernels (`ALL_REDUCE`, `REDUCE_SCATTER`) to use an explicit tolerance parameter (`Eps`) and added correct tolerances for each data type, including `bfloat16`. [[1]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L69-R85) [[2]](diffhunk://#diff-e18b8becff1c3b234733f5ca3250a76ffdc5edddb302c2da098b64b00ba7cf88L94-R113) These changes ensure full support for `bfloat16` in the test executor and improve the accuracy and maintainability of the CUDA test kernels. --------- Co-authored-by: Caio Rocha --- python/test/executor_test.py | 21 ++++++++++------- python/test/executor_test_verifier.cu | 33 +++++++++++++++++++-------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 59bc1661..8a309de5 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -24,6 +24,8 @@ def parse_dtype(dtype_str): dtype_str = dtype_str.strip().lower() if dtype_str == "float16": return cp.float16 + elif dtype_str in ("bfloat16", "bf16"): + return cp.float16 # same 2-byte size; mscclpp DataType is resolved from dtype_str elif dtype_str == "float32": return cp.float32 elif dtype_str == "int32": @@ -119,15 +121,18 @@ def parse_size(size_str): return int(size_str) -def dtype_to_mscclpp_dtype(dtype): - if dtype == cp.float16: +def dtype_to_mscclpp_dtype(dtype_str): + dtype_str = dtype_str.strip().lower() + if dtype_str == "float16": return DataType.float16 - elif dtype == cp.float32: + elif dtype_str in ("bfloat16", "bf16"): + return DataType.bfloat16 + elif dtype_str == "float32": return DataType.float32 - elif dtype == cp.int32: + elif dtype_str == "int32": return DataType.int32 else: - raise ValueError(f"Unknown data type: {dtype}") + raise ValueError(f"Unknown data type: {dtype_str}") def build_bufs( @@ -205,7 +210,7 @@ def main( result_buf.data.ptr, input_buf.nbytes, result_buf.nbytes, - dtype_to_mscclpp_dtype(dtype), + dtype_to_mscclpp_dtype(dtype_str), execution_plan, stream.ptr, packet_type, @@ -231,7 +236,7 @@ def main( npkit.shutdown() print( f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, " - f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} " + f"data size: {result_buf.nbytes} bytes data type: {dtype_str} " f"packet type: {packet_type}" ) executor = None @@ -243,7 +248,7 @@ if __name__ == "__main__": parser.add_argument("-path", "--execution_plan_path", type=str, required=True) parser.add_argument("--size", type=str, required=True) parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation") - parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32") + parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32") parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16") parser.add_argument("--n_iters", type=int, default=10) parser.add_argument("--n_graph_iters", type=int, default=10) diff --git a/python/test/executor_test_verifier.cu b/python/test/executor_test_verifier.cu index cf3cd4a6..e7749197 100644 --- a/python/test/executor_test_verifier.cu +++ b/python/test/executor_test_verifier.cu @@ -4,8 +4,10 @@ #include #if defined(__HIP_PLATFORM_AMD__) +#include #include #else +#include #include #endif @@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) { } \ } +FILL_DATA(bfloat16, __nv_bfloat16) FILL_DATA(float16, __half) FILL_DATA(float32, float) FILL_DATA(int32, int) @@ -48,11 +51,12 @@ FILL_DATA(int32, int) } \ } +TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16) TEST_DATA_ALL_GATHER(float16, __half) TEST_DATA_ALL_GATHER(float32, float) TEST_DATA_ALL_GATHER(int32, int) -#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType) \ +#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType( \ DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \ for (int rank = 0; rank < num_ranks; rank++) { \ @@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int) } \ } \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \ - assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks); \ + float expected = float(test_buf[i]); \ + float result = float(result_buf[i]); \ + float tol = Eps * num_ranks * (1.0f + abs(expected)); \ + assert(abs(result - expected) <= tol); \ } \ } -TEST_DATA_ALL_REDUCE(float16, __half) -TEST_DATA_ALL_REDUCE(float32, float) -TEST_DATA_ALL_REDUCE(int32, int) +TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f) +TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f) +TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f) +TEST_DATA_ALL_REDUCE(int32, int, 0.0f) -#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType) \ +#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType( \ DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \ int nem_elems_per_rank = num_elems / num_ranks; \ @@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int) } \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \ if (i >= offset && i < offset + nem_elems_per_rank) { \ - assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks); \ + float expected = float(test_buf[i]); \ + float result = float(result_buf[i - offset]); \ + float tol = Eps * num_ranks * (1.0f + abs(expected)); \ + assert(abs(result - expected) <= tol); \ } \ } \ } -TEST_DATA_REDUCE_SCATTER(float16, __half) -TEST_DATA_REDUCE_SCATTER(float32, float) -TEST_DATA_REDUCE_SCATTER(int32, int) +TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f) +TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f) +TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f) +TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f) #define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType) \ extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType( \ @@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int) } \ } +TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16) TEST_DATA_ALL_TO_ALL(float16, __half) TEST_DATA_ALL_TO_ALL(float32, float) TEST_DATA_ALL_TO_ALL(int32, int) \ No newline at end of file From 5d608feaa52f83e7b556fde647a45b63eb119047 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 14 May 2026 14:06:12 -0700 Subject: [PATCH 08/12] Enhance cross-node CudaIpc availability check (#803) --- include/mscclpp/utils.hpp | 9 +++++++++ src/core/connection.cc | 11 +++++++++++ src/core/context.cc | 17 ++++++++-------- src/core/gpu_ipc_mem.cc | 27 ++++++++++++++++++++++---- test/mp_unit/port_channel_tests.cu | 31 +++++++++++++++++++++++++++--- 5 files changed, 79 insertions(+), 16 deletions(-) diff --git a/include/mscclpp/utils.hpp b/include/mscclpp/utils.hpp index ffe269da..54cfa4a0 100644 --- a/include/mscclpp/utils.hpp +++ b/include/mscclpp/utils.hpp @@ -46,6 +46,15 @@ std::string getIBDeviceName(Transport ibTransport); /// @return The InfiniBand transport associated with the specified device name. Transport getIBTransportByDeviceName(const std::string& ibDeviceName); +/// Check whether this process can allocate/import CUDA memory with NVIDIA fabric handles +/// (`CU_MEM_HANDLE_TYPE_FABRIC`). Fabric handles enable cross-node `Transport::CudaIpc` on +/// MNNVL systems (e.g., GB200 NVL72) when the IMEX service is running. Returns `false` on +/// hardware/software stacks without MNNVL+IMEX, in which case `Transport::CudaIpc` is +/// restricted to ranks within the same node. +/// +/// @return `true` if fabric handles are usable from this process, `false` otherwise. +bool isFabricMemHandleAvailable(); + } // namespace mscclpp #endif // MSCCLPP_UTILS_HPP_ diff --git a/src/core/connection.cc b/src/core/connection.cc index 8b6c0afb..11ecd968 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -82,6 +82,17 @@ MSCCLPP_API_CPP int Connection::getMaxWriteQueueSize() const { return impl_->get CudaIpcConnection::CudaIpcConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) : BaseConnection(context, localEndpoint) { + // Log fabric/MNNVL availability exactly once per process so any later cross-node CudaIpc failure + // is easy to triage. C++11 magic statics make this thread-safe without an explicit mutex. + // NOTE: assigning the message to a std::string first avoids the logger's pointer-formatting + // overload from kicking in on the const char* result of the ternary. + [[maybe_unused]] static const bool fabricAvailable_ = []() { + const bool avail = isFabricMemHandleAvailable(); + const std::string status = avail ? "available (cross-node CudaIpc via MNNVL/IMEX is supported)" + : "NOT available (CudaIpc is restricted to intra-node ranks on this system)"; + INFO(CONN, "CudaIpc transport selected: fabric handles ", status); + return avail; + }(); if (localEndpoint.transport() != Transport::CudaIpc || remoteEndpoint.transport() != Transport::CudaIpc) { THROW(CONN, Error, ErrorCode::InternalError, "CudaIpc transport is required for CudaIpcConnection"); } diff --git a/src/core/context.cc b/src/core/context.cc index aabe71df..b55939e3 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -4,12 +4,11 @@ #include "context.hpp" #include -#include #include "api.h" #include "connection.hpp" -#include "debug.h" #include "endpoint.hpp" +#include "logger.hpp" #include "registered_memory.hpp" namespace mscclpp { @@ -78,19 +77,17 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) { if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) { - throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for local endpoint"); } if (remoteEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().id < 0) { - throw Error("No GPU device ID provided for remote endpoint", ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for remote endpoint"); } auto localTransport = localEndpoint.transport(); auto remoteTransport = remoteEndpoint.transport(); if (localTransport != remoteTransport && !(AllIBTransports.has(localTransport) && AllIBTransports.has(remoteTransport))) { - std::stringstream ss; - ss << "Transport mismatch between local (" << localTransport << ") and remote (" << remoteEndpoint.transport() - << ") endpoints"; - throw Error(ss.str(), ErrorCode::InvalidUsage); + THROW(CONN, Error, ErrorCode::InvalidUsage, "Transport mismatch between local (", localTransport, ") and remote (", + remoteTransport, ") endpoints"); } std::shared_ptr conn; if (localTransport == Transport::CudaIpc) { @@ -100,7 +97,9 @@ MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const } else if (localTransport == Transport::Ethernet) { conn = std::make_shared(shared_from_this(), localEndpoint, remoteEndpoint); } else { - throw Error("Unsupported transport", ErrorCode::InternalError); + THROW(CONN, Error, ErrorCode::InternalError, "Unsupported transport: ", localTransport, + " (this usually means EndpointConfig.transport was left at Transport::Unknown — " + "set it explicitly to CudaIpc, an IB transport, or Ethernet)"); } return Connection(conn); } diff --git a/src/core/gpu_ipc_mem.cc b/src/core/gpu_ipc_mem.cc index c863ecdd..0f58ed20 100644 --- a/src/core/gpu_ipc_mem.cc +++ b/src/core/gpu_ipc_mem.cc @@ -7,6 +7,7 @@ #include #include +#include #include "logger.hpp" #include "unix_socket.hpp" @@ -35,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typ return os; } -[[maybe_unused]] static bool isFabricMemHandleAvailable() { +bool isFabricMemHandleAvailable() { #if (CUDA_NVLS_API_AVAILABLE) static int resultCache = -1; // -1: uninitialized, 0: not available, 1: available if (resultCache != -1) { @@ -283,11 +284,19 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) THROW(GPU, Error, ErrorCode::InvalidUsage, "GpuIpcMemHandle type is None, cannot create GpuIpcMem"); } if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) { - if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) == - CUDA_SUCCESS) { + CUresult res = + cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC); + if (res == CUDA_SUCCESS) { // Ignore allocHandle in the handle struct since it is process-local and not transferable across processes. handle_.fabric.allocHandle = {}; type_ = GpuIpcMemHandle::Type::Fabric; + } else { + const char* errStr = nullptr; + (void)cuGetErrorString(res, &errStr); + const std::string errMsg = errStr ? std::string(errStr) : std::string("unknown CUDA error"); + WARN(GPU, "Fabric IPC handle import failed (", errMsg, + "); cross-node CudaIpc requires NVIDIA MNNVL hardware and a running IMEX service. ", + "Falling back to other handle types if available."); } } if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::PosixFd)) { @@ -303,7 +312,17 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle) type_ = GpuIpcMemHandle::Type::RuntimeIpc; } if (type_ == GpuIpcMemHandle::Type::None) { - THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (type: ", handle_.typeFlags, ")"); + const bool fabricOnly = (handle_.typeFlags == GpuIpcMemHandle::Type::Fabric); + const std::string hint = fabricOnly + ? std::string( + "The remote rank sent only a Fabric (MNNVL) handle, but this rank could not " + "import it. Check that the IMEX daemon is running on both nodes and that the " + "GPUs share an NVLink fabric.") + : std::string( + "All handle types failed to import; check IMEX service and POSIX FD socket " + "availability."); + THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (offered types: ", handle_.typeFlags, "). ", + hint); } } diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index 3b14ed31..4b1b0cfb 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -36,6 +36,18 @@ inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) { #define REQUIRE_GDR_FOR_IB_MODE(mode) // No extra requirements on non-CUDA platforms. #endif +// Skip an IPC-only PortChannel test (useIPC=true, useIB=false, useEthernet=false) when CudaIpc +// cannot connect this rank pair. CudaIpc works intra-node always, and cross-node only on MNNVL +// systems (GB200 NVL72 + IMEX). The combined check is "at least 2 ranks per node" OR "fabric +// (MNNVL) handles are usable on this system". +#define REQUIRE_CUDA_IPC_AVAILABLE \ + do { \ + if (gEnv->nRanksPerNode < 2 && !mscclpp::isFabricMemHandleAvailable()) { \ + SKIP_TEST() << "CudaIpc requires intra-node ranks (nRanksPerNode>=2) or MNNVL fabric handles, \ +both unavailable here."; \ + } \ + } while (0) + void PortChannelOneToOneTest::SetUp() { // Use only two ranks setNumRanksToUse(2); @@ -71,7 +83,10 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vectorrank)) && useIPC) { + if (useIPC) { + // CudaIpc works intra-node always, and cross-node on MNNVL systems (GB200 NVL72 + IMEX) + // via fabric handles. Tests that exercise CudaIpc across nodes on non-MNNVL hardware should + // gate themselves with REQUIRE_CUDA_IPC_AVAILABLE; we always request CudaIpc here when asked. cfg.transport = mscclpp::Transport::CudaIpc; } else if (useIb) { cfg.transport = ibTransport; @@ -262,6 +277,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { } TEST(PortChannelOneToOneTest, PingPong) { + REQUIRE_CUDA_IPC_AVAILABLE; testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } @@ -279,6 +295,7 @@ TEST(PortChannelOneToOneTest, PingPongEthernet) { } TEST(PortChannelOneToOneTest, PingPongWithPoll) { + REQUIRE_CUDA_IPC_AVAILABLE; testPingPong(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default}); } @@ -291,6 +308,7 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { } PERF_TEST(PortChannelOneToOneTest, PingPongPerf) { + REQUIRE_CUDA_IPC_AVAILABLE; testPingPongPerf(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } @@ -482,7 +500,10 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) proxyService->stopProxy(); } -TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } +TEST(PortChannelOneToOneTest, PacketPingPong) { + REQUIRE_CUDA_IPC_AVAILABLE; + testPacketPingPong(false, IbMode::Default); +} TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) { REQUIRE_IBVERBS; @@ -490,7 +511,10 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) { testPacketPingPong(true, IbMode::Host); } -PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } +PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { + REQUIRE_CUDA_IPC_AVAILABLE; + testPacketPingPongPerf(false, IbMode::Default); +} PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { REQUIRE_IBVERBS; @@ -583,6 +607,7 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) { } PERF_TEST(PortChannelOneToOneTest, Bandwidth) { + REQUIRE_CUDA_IPC_AVAILABLE; testBandwidth(PingPongTestParams{ .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } From 5911998181a60eeb546300843bb1f76f504df97d Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Thu, 14 May 2026 21:29:10 +0000 Subject: [PATCH 09/12] ext/ep: gate NVLS HT B2 on cross-host fabric IPC support (H100 fix) The NVLS HT B2 path introduced in 3ab2e43b activated whenever isNvlsSupported() && num_rdma_ranks > 1. On H100 NDv5 / Azure CX-7 RoCE that is true (H100 has intra-node NVLink multicast), but there is no cross-host NVSwitch fabric. mscclpp's GpuIpcMem::create then falls back to CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR whose handle exchange routes through /tmp/mscclpp_bootstrap_.sock -- a master-rank-0 unix-domain socket worker ranks cannot reach. Symptom on every commit since 3ab2e43b: RuntimeError: connect() failed for unix socket to /tmp/mscclpp_bootstrap_.sock MSCCLPP_EP_FABRIC_IPC=0 was being silently ignored. src/ext/ep/buffer.cc: add resolve_fabric_ipc_supported() helper. Resolution: 1. MSCCLPP_EP_FABRIC_IPC env var (0/off/false/no => off, 1/on/true/yes/force => on, otherwise auto). 2. Auto-detect: requires both - CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED == 1 - device compute capability >= sm_100 (Blackwell+). Gate both use_fabric_ipc_alloc (RDMA buffer allocator) and nvls_ht_enabled (HT B2 multicast region) on fabric_ipc_supported. On H100 both fall back to cudaMalloc + legacy PortChannel; on GB200 NVL72 both remain enabled. Diagnostic prints now show fabric_ipc=. test/python/ext/ep/test_internode_multirank.py: replace hardcoded NUM_MAX_NVL_PEERS=4 with a runtime _detect_local_world_size() helper that reads MSCCLPP_EP_LOCAL_WORLD_SIZE / LOCAL_WORLD_SIZE / OMPI_COMM_WORLD_LOCAL_SIZE, falling back to torch.cuda.device_count(). Makes the test correct on both H100 (8 GPUs/node) and GB200 (4 GPUs/node) without code changes. src/core/atomicadd_kernel.cu: use cuCtxCreate_v4 for CUDA >= 12.5 (the underlying symbol was renamed); preserve legacy 3-arg cuCtxCreate for older toolkits. Verified on 2x H100 NDv5 at HEAD: LL intranode (8 GPUs) PASS LL internode (16 GPUs, 2 nodes) PASS HT intranode (8 GPUs) PASS HT internode (16 GPUs, 2 nodes) PASS Diagnostic on H100: [mscclpp_ep] rdma_buffer allocator: cudaMalloc (low_latency=0, nvls=1, fabric_ipc=0) [mscclpp_ep] NVLS HT multicast: disabled (low_latency=0, num_rdma_ranks=2, nvls_supported=1, fabric_ipc=0) --- src/core/atomicadd_kernel.cu | 4 +- src/ext/ep/buffer.cc | 79 +++++++++++++++++-- .../python/ext/ep/test_internode_multirank.py | 22 +++++- 3 files changed, 94 insertions(+), 11 deletions(-) diff --git a/src/core/atomicadd_kernel.cu b/src/core/atomicadd_kernel.cu index 59e660d7..21427df6 100644 --- a/src/core/atomicadd_kernel.cu +++ b/src/core/atomicadd_kernel.cu @@ -41,9 +41,9 @@ void CudaIpcStream::atomicAdd(uint64_t* dst, int64_t value) { // 4-arg form on new toolkits, fall back to the legacy 3-arg form on // CUDA < 12.5 so we keep compiling against older drivers/toolkits. #if CUDA_VERSION >= 12050 - res = cuCtxCreate(&proxyAtomicCtx_, NULL, 0, cuDevice); + res = cuCtxCreate_v4(&proxyAtomicCtx_, NULL, 0, cuDevice); #else - res = cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice); + res = cuCtxCreate(&proxyAtomicCtx_vice); #endif if (res != CUDA_SUCCESS) throw Error("cuCtxCreate failed", ErrorCode::InternalError); diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc index 74aa7922..02b38c36 100644 --- a/src/ext/ep/buffer.cc +++ b/src/ext/ep/buffer.cc @@ -59,6 +59,56 @@ static int resolve_num_proxy_services() { return 8; } +// Cross-host cuMem fabric IPC capability. +// +// `isNvlsSupported()` returns true for any device with NVLink multicast, +// including H100. But NVLS by itself only works inside one host's NVLink +// island; cross-host sharing of cuMem allocations / NVLS multicast handles +// requires the device to be on an actual cross-host NVLink fabric (GB200 +// NVL72 with nvidia-imex on Azure today). H100 reports +// `CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED == 1` too but lacks +// the NVSwitch fabric to actually share fabric handles across hosts; the +// cross-host import then falls back to POSIX-FD, whose handle exchange +// routes through a unix-domain socket on the master host -- which +// worker-node ranks cannot reach (`connect() failed for unix socket to +// /tmp/mscclpp_bootstrap_*.sock`). That is the exact failure signature +// commit 3ab2e43b ("NVLS HT B2 phases 1-3") introduced on H100 / Azure +// CX-7 RoCE. +// +// To stay safe by default we require both: +// - device attr `CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED` +// - compute capability >= sm_100 (Blackwell+). +// +// Resolution order: +// 1. `MSCCLPP_EP_FABRIC_IPC` env var (`0`/`off`/`false`/`no` => off, +// `1`/`on`/`true`/`yes`/`force` => on, anything else => auto). When +// set, env value takes precedence over the device check. +// 2. Auto-detect via the two checks above. +static bool resolve_fabric_ipc_supported() { + if (const char* env = std::getenv("MSCCLPP_EP_FABRIC_IPC")) { + std::string v(env); + for (auto& c : v) c = std::tolower(static_cast(c)); + if (v == "0" || v == "off" || v == "false" || v == "no") return false; + if (v == "1" || v == "on" || v == "true" || v == "yes" || v == "force") return true; + } + int dev = 0; + if (cudaGetDevice(&dev) != cudaSuccess) return false; + CUdevice cuDev; + if (cuDeviceGet(&cuDev, dev) != CUDA_SUCCESS) return false; + int supported = 0; + if (cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, cuDev) != CUDA_SUCCESS) { + return false; + } + if (!supported) return false; + cudaDeviceProp prop{}; + if (cudaGetDeviceProperties(&prop, dev) != cudaSuccess) return false; + // Blackwell+ (sm_100, GB200 NVL72) is the only deployed cross-host + // NVLink fabric today. H100 (sm_90) advertises fabric-handle support + // but lacks the nvidia-imex / NVSwitch fabric to actually share them + // across hosts. + return prop.major >= 10; +} + Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes, bool low_latency_mode) : rank(rank), num_ranks(num_ranks), @@ -346,8 +396,16 @@ void Buffer::sync(const std::vector& device_ids, // fabric-IPC so cross-node `handle.put(data)` can be replaced by // direct kernel-side writes via NVL72 fabric pointers — bypassing // the broken Azure CX-7 RoCE RDMA WRITE path. + // + // NVLS-supported but FABRIC-unsupported deployments (e.g. H100 on + // Azure NDv5) must not take this path for the HT cross-host case: + // their `gpuCallocPhysical` result is a POSIX-FD-only handle whose + // cross-host import falls back to a master-local unix socket which + // worker ranks cannot reach. + static const bool fabric_ipc_supported = resolve_fabric_ipc_supported(); const bool use_fabric_ipc_alloc = - mscclpp::isNvlsSupported() && (low_latency_mode || num_rdma_ranks > 1); + mscclpp::isNvlsSupported() && fabric_ipc_supported && + (low_latency_mode || num_rdma_ranks > 1); if (use_fabric_ipc_alloc) { rdma_buffer_ptr = mscclpp::detail::gpuCallocPhysical(num_rdma_bytes); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); @@ -356,9 +414,9 @@ void Buffer::sync(const std::vector& device_ids, CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); } if (rank == 0) { - printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d)\n", + printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d, fabric_ipc=%d)\n", use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc", - (int)low_latency_mode, (int)mscclpp::isNvlsSupported()); + (int)low_latency_mode, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported); fflush(stdout); } bootstrap->barrier(); @@ -501,12 +559,20 @@ void Buffer::sync(const std::vector& device_ids, // or there is only one RDMA rank (intranode-only), `nvls_ht_enabled` // stays `false` and kernels select the legacy PortChannel path. // + // Additionally: NVLS alone is insufficient for cross-host. H100 has + // NVLS within a node but no cross-host NVSwitch fabric, so + // `connectNvlsCollective` either fails or builds a per-node-only + // multicast object, and the POSIX-FD fallback for handle exchange + // routes through a master-local unix socket that worker ranks cannot + // reach. Gate this path on `fabric_ipc_supported` so non-Blackwell + // deployments cleanly use the legacy PortChannel path. + // // Skipped for `low_latency_mode` since LL has its own (working) // fabric-IPC path via Proposal A and does not use the HT counter // protocol. // ------------------------------------------------------------------ nvls_ht_enabled = false; - if (!low_latency_mode && num_rdma_ranks > 1 && mscclpp::isNvlsSupported()) { + if (!low_latency_mode && num_rdma_ranks > 1 && mscclpp::isNvlsSupported() && fabric_ipc_supported) { // Worst-case sizing — chosen so the same multicast buffer fits any // (num_sms, num_rdma_ranks) configuration the kernels may launch with. const size_t kCounterBytesPerChannel = @@ -576,8 +642,9 @@ void Buffer::sync(const std::vector& device_ids, } else if (rank == 0) { printf( "[mscclpp_ep] NVLS HT multicast: disabled (low_latency=%d, num_rdma_ranks=%d, " - "nvls_supported=%d)\n", - (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported()); + "nvls_supported=%d, fabric_ipc=%d)\n", + (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(), + (int)fabric_ipc_supported); fflush(stdout); } diff --git a/test/python/ext/ep/test_internode_multirank.py b/test/python/ext/ep/test_internode_multirank.py index eddfb6be..b7ae2e8d 100644 --- a/test/python/ext/ep/test_internode_multirank.py +++ b/test/python/ext/ep/test_internode_multirank.py @@ -41,10 +41,26 @@ import torch import torch.distributed as dist +def _detect_local_world_size(): + """Number of GPUs per node (4 on GB200, 8 on H100/A100, etc.). + + Resolution order: + 1. `MSCCLPP_EP_LOCAL_WORLD_SIZE` env var (matches the C++ side). + 2. `LOCAL_WORLD_SIZE` (torchrun) or `OMPI_COMM_WORLD_LOCAL_SIZE` (mpirun). + 3. `torch.cuda.device_count()` on the current host. + """ + for var in ("MSCCLPP_EP_LOCAL_WORLD_SIZE", "LOCAL_WORLD_SIZE", "OMPI_COMM_WORLD_LOCAL_SIZE"): + v = os.environ.get(var) + if v and int(v) > 0: + return int(v) + return max(1, torch.cuda.device_count()) + + def init_dist(): rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) - local_rank = int(os.environ.get("LOCAL_RANK", rank % 4)) + local_world_size = _detect_local_world_size() + local_rank = int(os.environ.get("LOCAL_RANK", rank % local_world_size)) torch.cuda.set_device(local_rank) dist.init_process_group( backend="nccl", world_size=world_size, rank=rank, device_id=torch.device(f"cuda:{local_rank}") @@ -71,10 +87,10 @@ def main(): rank, num_ranks, local_rank, group = init_dist() from mscclpp.ext import ep - NUM_MAX_NVL_PEERS = 4 + NUM_MAX_NVL_PEERS = _detect_local_world_size() assert ( num_ranks % NUM_MAX_NVL_PEERS == 0 and num_ranks > NUM_MAX_NVL_PEERS - ), f"expected >1 node with 8 GPUs each, got num_ranks={num_ranks}" + ), f"expected >1 node with {NUM_MAX_NVL_PEERS} GPUs each, got num_ranks={num_ranks}" num_nodes = num_ranks // NUM_MAX_NVL_PEERS num_local_ranks = NUM_MAX_NVL_PEERS From 252a422030c092163b8c75577b7223ca264f2443 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 15 May 2026 11:50:43 -0700 Subject: [PATCH 10/12] Handle PortChannel flush asynchronously from the host proxy (#802) When a `PortChannel` requests `flush()`, the host-side proxy was being blocked, which may cause head-of-line blocking of other parallel `PortChannel`s' requests. Now the proxy handles `flush()` requests asynchronously. This feature especially helps performance when we need multiple IB QPs and need to flush QPs. --- include/mscclpp/core.hpp | 2 + include/mscclpp/fifo.hpp | 5 + include/mscclpp/port_channel.hpp | 4 + include/mscclpp/port_channel_device.hpp | 33 ++- include/mscclpp/proxy.hpp | 4 +- src/core/connection.cc | 36 ++- src/core/fifo.cc | 2 + src/core/include/atomic.hpp | 16 +- src/core/include/connection.hpp | 25 ++ src/core/include/proxy_impl.hpp | 38 ++++ src/core/port_channel.cc | 64 +++++- src/core/proxy.cc | 34 +-- test/mp_unit/mp_unit_tests.hpp | 8 + test/mp_unit/port_channel_tests.cu | 289 ++++++++++++++++++++++++ 14 files changed, 507 insertions(+), 53 deletions(-) create mode 100644 src/core/include/proxy_impl.hpp diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index ca2fc34f..45b56bcc 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -676,6 +676,8 @@ class Connection { friend class Semaphore; friend class ProxyService; friend class BaseConnection; + friend struct BasePortChannel; + friend struct PortChannel; }; /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user. diff --git a/include/mscclpp/fifo.hpp b/include/mscclpp/fifo.hpp index 2ee793ad..6aae03b5 100644 --- a/include/mscclpp/fifo.hpp +++ b/include/mscclpp/fifo.hpp @@ -29,6 +29,11 @@ class Fifo { /// Remove the head trigger. void pop(); + /// Get the current tail position — the FIFO push-return value of the trigger about to be + /// (or currently being) processed by the proxy thread. Monotonically increasing. + /// @return The current tail position. + uint64_t tail() const; + /// Get FIFO size. /// @return Number of entries in the FIFO. int size() const; diff --git a/include/mscclpp/port_channel.hpp b/include/mscclpp/port_channel.hpp index ed660407..18d67524 100644 --- a/include/mscclpp/port_channel.hpp +++ b/include/mscclpp/port_channel.hpp @@ -84,8 +84,12 @@ class ProxyService : public BaseProxyService { std::vector memories_; std::shared_ptr proxy_; std::unordered_map, int> inflightRequests_; + // Latest pending TriggerSync FIFO position per connection. Proxy publishes pos+1 to the + // connection's gpuFlushDonePos_ when the CQ drains, then erases the entry. + std::unordered_map, uint64_t> pendingFlushPos_; ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw); + void progressFlushes(); }; /// Port channel without specifying source/destination memory regions. diff --git a/include/mscclpp/port_channel_device.hpp b/include/mscclpp/port_channel_device.hpp index adff3fcd..74fa3d89 100644 --- a/include/mscclpp/port_channel_device.hpp +++ b/include/mscclpp/port_channel_device.hpp @@ -17,6 +17,19 @@ using SemaphoreId = uint32_t; /// actual. using MemoryId = uint32_t; +namespace detail { +#if defined(MSCCLPP_DEVICE_COMPILE) +/// Wait until the proxy has processed and drained the TriggerSync at FIFO position `fifoPos`. +/// The proxy publishes `flushDonePos = latestCompletedPos + 1` when the CQ drains, so the +/// wait condition `flushDonePos > fifoPos` is satisfied exactly when our own request has +/// been completed. Using the FIFO push position as the wait target couples the wait to the +/// FIFO order, avoiding races when multiple GPU threads concurrently flush the same channel. +MSCCLPP_DEVICE_INLINE void waitFlush(uint64_t* flushDonePos, uint64_t fifoPos, [[maybe_unused]] int64_t maxSpinCount) { + POLL_MAYBE_JAILBREAK((atomicLoad(flushDonePos, memoryOrderAcquire) <= fifoPos), maxSpinCount); +} +#endif // defined(MSCCLPP_DEVICE_COMPILE) +} // namespace detail + struct BasePortChannelDeviceHandle { SemaphoreId semaphoreId_; @@ -26,12 +39,16 @@ struct BasePortChannelDeviceHandle { // can produce for and the sole proxy thread consumes it. FifoDeviceHandle fifo_; + // One past the highest FIFO position with a completed flush on this connection. + // Host-pinned: proxy writes after CQ drain, GPU reads in waitFlush(). + uint64_t* flushDonePos_; + MSCCLPP_INLINE BasePortChannelDeviceHandle() = default; MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, - FifoDeviceHandle fifo) - : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {} + FifoDeviceHandle fifo, uint64_t* flushDonePos) + : semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo), flushDonePos_(flushDonePos) {} #if defined(MSCCLPP_DEVICE_COMPILE) /// Push a TriggerData to the FIFO. @@ -86,9 +103,9 @@ struct BasePortChannelDeviceHandle { /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dstId, uint64_t dstOffset, MemoryId srcId, uint64_t srcOffset, uint64_t size, int64_t maxSpinCount = 1000000) { - uint64_t curFifoHead = + uint64_t pos = fifo_.push({TriggerData | TriggerFlag | TriggerSync, dstId, dstOffset, srcId, srcOffset, size, semaphoreId_}); - fifo_.sync(curFifoHead, maxSpinCount); + detail::waitFlush(flushDonePos_, pos, maxSpinCount); } /// Push a TriggerData, a TriggerFlag, and a TriggerSync at the same time to the FIFO. @@ -105,8 +122,8 @@ struct BasePortChannelDeviceHandle { /// Push a TriggerSync to the FIFO. /// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative. MSCCLPP_DEVICE_INLINE void flush(int64_t maxSpinCount = 1000000) { - uint64_t curFifoHead = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_}); - fifo_.sync(curFifoHead, maxSpinCount); + uint64_t pos = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_}); + detail::waitFlush(flushDonePos_, pos, maxSpinCount); } /// Check if the port channel has been signaled. @@ -128,8 +145,8 @@ struct PortChannelDeviceHandle : public BasePortChannelDeviceHandle { MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(SemaphoreId semaphoreId, Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo, - MemoryId dst, MemoryId src) - : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {} + MemoryId dst, MemoryId src, uint64_t* flushDonePos) + : BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo, flushDonePos), dst_(dst), src_(src) {} #if defined(MSCCLPP_DEVICE_COMPILE) /// Push a TriggerData to the FIFO. diff --git a/include/mscclpp/proxy.hpp b/include/mscclpp/proxy.hpp index 990deabb..291206c0 100644 --- a/include/mscclpp/proxy.hpp +++ b/include/mscclpp/proxy.hpp @@ -20,8 +20,9 @@ enum class ProxyHandlerResult { }; class Proxy; +class ProxyService; -/// Handler function type for proxy. +/// Handler function type for proxy. Called once per ready FIFO trigger. using ProxyHandler = std::function; /// Host-side proxy for PortChannels. @@ -54,6 +55,7 @@ class Proxy { std::shared_ptr fifo(); private: + friend class ProxyService; struct Impl; std::unique_ptr pimpl_; }; diff --git a/src/core/connection.cc b/src/core/connection.cc index 11ecd968..e01c4a6e 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -7,13 +7,13 @@ #include #endif -#include #include #include #include #include #include "api.h" +#include "atomic.hpp" #include "context.hpp" #include "endpoint.hpp" #include "gpu_utils_internal.hpp" @@ -43,7 +43,10 @@ const RegisteredMemory::Impl& BaseConnection::getImpl(const RegisteredMemory& me Context::Impl& BaseConnection::getImpl(Context& context) { return *(context.pimpl_); } MSCCLPP_API_CPP BaseConnection::BaseConnection(std::shared_ptr context, const Endpoint& localEndpoint) - : context_(context), localEndpoint_(localEndpoint), maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()) {} + : context_(context), + localEndpoint_(localEndpoint), + maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()), + gpuFlushDonePos_(detail::gpuCallocHostShared()) {} MSCCLPP_API_CPP std::shared_ptr BaseConnection::context() const { return context_; } @@ -489,6 +492,35 @@ void IBConnection::flush(int64_t timeoutUsec) { #endif } +void IBConnection::requestFlush() { + // No-op: IB sends were already posted by prior conn.write() calls in handleTrigger. + // progressFlush() drives completion by polling the send CQ. +} + +bool IBConnection::progressFlush() { + if (recvThreadError_.load(std::memory_order_acquire)) { + THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_); + } + + auto qp = qp_.lock(); + if (!qp || qp->getNumSendCqItems() == 0) { + return true; // QP expired or CQ already drained. + } + + int wcNum = qp->pollSendCq(); + if (wcNum < 0) { + THROW(NET, IbError, errno, "pollSendCq failed in progressFlush"); + } + for (int i = 0; i < wcNum; ++i) { + int status = qp->getSendWcStatus(i); + if (status != static_cast(WsStatus::Success)) { + THROW(NET, Error, ErrorCode::SystemError, + "an IB work item failed in progressFlush: ", qp->getSendWcStatusString(i)); + } + } + return qp->getNumSendCqItems() == 0; +} + // EthernetConnection EthernetConnection::EthernetConnection(std::shared_ptr context, const Endpoint& localEndpoint, diff --git a/src/core/fifo.cc b/src/core/fifo.cc index e0ac9916..b11775d8 100644 --- a/src/core/fifo.cc +++ b/src/core/fifo.cc @@ -53,6 +53,8 @@ MSCCLPP_API_CPP void Fifo::pop() { atomicStore(pimpl_->tail.get(), curTail + 1, memoryOrderRelease); } +MSCCLPP_API_CPP uint64_t Fifo::tail() const { return *(pimpl_->tail); } + MSCCLPP_API_CPP int Fifo::size() const { return pimpl_->size; } MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() const { diff --git a/src/core/include/atomic.hpp b/src/core/include/atomic.hpp index 26f549f2..b6079162 100644 --- a/src/core/include/atomic.hpp +++ b/src/core/include/atomic.hpp @@ -4,18 +4,16 @@ #ifndef MSCCLPP_ATOMIC_HPP_ #define MSCCLPP_ATOMIC_HPP_ -#if defined(MSCCLPP_USE_CUDA) -#ifndef MSCCLPP_DEVICE_CUDA +// On CUDA host-side compiles, force atomic_device.hpp's CUDA branch so host code uses +// cuda::atomic_ref (for system-scope ordering with GPU readers). On CUDA device compiles +// (MSCCLPP_DEVICE_CUDA already set by device.hpp) and on ROCm builds, include normally — +// atomic_device.hpp's branch selection works correctly without forcing. +#if defined(MSCCLPP_USE_CUDA) && !defined(MSCCLPP_DEVICE_CUDA) #define MSCCLPP_DEVICE_CUDA #include #undef MSCCLPP_DEVICE_CUDA -#endif // !defined(MSCCLPP_DEVICE_CUDA) -#else // !defined(MSCCLPP_USE_CUDA) -#ifndef MSCCLPP_DEVICE_HIP -#define MSCCLPP_DEVICE_HIP +#else #include -#undef MSCCLPP_DEVICE_HIP -#endif // !defined(MSCCLPP_DEVICE_HIP) -#endif // !defined(MSCCLPP_USE_CUDA) +#endif #endif // MSCCLPP_ATOMIC_HPP_ \ No newline at end of file diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index 22a9930f..eda4b3ef 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -50,6 +50,23 @@ class BaseConnection { /// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics). virtual bool isSignalForwarding() const { return false; } + /// Request a flush. Subclasses that support async flush (e.g. IBConnection) override this + /// to be a no-op and rely on progressFlush() to drive completion via the proxy thread. + /// The default does a blocking flush(); progressFlush() then trivially returns true. + /// @note Only call from the proxy thread. + virtual void requestFlush() { flush(); } + + /// Progress pending async flush operations (non-blocking CQ poll). + /// @note Only call from the proxy thread. + /// @return true if no flush is pending (CQ fully drained or no request). + virtual bool progressFlush() { return true; } + + /// Get pointer to the GPU-visible flush-done position (host-pinned memory). + /// ProxyService writes "one past the highest completed FIFO position" here when the CQ + /// drains; GPU threads spin on it (`waitFlush`) until it surpasses their own push position. + /// @note Pointer is valid for the lifetime of this Connection. + uint64_t* getFlushDonePtr() const { return gpuFlushDonePos_.get(); } + virtual Transport transport() const = 0; virtual Transport remoteTransport() const = 0; @@ -75,6 +92,11 @@ class BaseConnection { std::shared_ptr context_; Endpoint localEndpoint_; int maxWriteQueueSize_; + + // GPU-visible flush-done position (host-pinned memory). ProxyService writes one past the + // highest FIFO position whose TriggerSync request has fully completed on this connection + // (CQ drained for IB, synchronous flush() returned for non-IB). + std::shared_ptr gpuFlushDonePos_; }; class CudaIpcConnection : public BaseConnection { @@ -149,6 +171,9 @@ class IBConnection : public BaseConnection { void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override; void flush(int64_t timeoutUsec) override; + + void requestFlush() override; + bool progressFlush() override; }; class EthernetConnection : public BaseConnection { diff --git a/src/core/include/proxy_impl.hpp b/src/core/include/proxy_impl.hpp new file mode 100644 index 00000000..a588e5df --- /dev/null +++ b/src/core/include/proxy_impl.hpp @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_PROXY_IMPL_HPP_ +#define MSCCLPP_PROXY_IMPL_HPP_ + +#include +#include +#include +#include +#include +#include + +namespace mscclpp { + +struct Proxy::Impl { + ProxyHandler handler; + std::function threadInit; + std::function progressHandler; + std::shared_ptr fifo; + std::atomic_bool threadStarted; + std::thread service; + std::atomic_bool running; + + Impl(ProxyHandler handler, std::function threadInit, int fifoSize) + : handler(handler), + threadInit(threadInit), + fifo(std::make_shared(fifoSize)), + threadStarted(false), + running(false) {} + + // Must be called before start() — the proxy thread captures progressHandler at start time. + void setProgressHandler(std::function h) { progressHandler = std::move(h); } +}; + +} // namespace mscclpp + +#endif // MSCCLPP_PROXY_IMPL_HPP_ diff --git a/src/core/port_channel.cc b/src/core/port_channel.cc index b8242db3..210b2eeb 100644 --- a/src/core/port_channel.cc +++ b/src/core/port_channel.cc @@ -1,11 +1,14 @@ // Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. +// Licensed under the MIT License. #include #include #include "api.h" -#include "debug.h" +#include "atomic.hpp" +#include "connection.hpp" +#include "logger.hpp" +#include "proxy_impl.hpp" namespace mscclpp { @@ -34,11 +37,12 @@ MSCCLPP_API_CPP ProxyService::ProxyService(int fifoSize) { MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice)); if (deviceNumaNode >= 0) { numaBind(deviceNumaNode); - INFO(MSCCLPP_INIT, "NUMA node of ProxyService proxy thread is set to %d", deviceNumaNode); + INFO(CONN, "NUMA node of ProxyService proxy thread is set to ", deviceNumaNode); } }; auto handlerFunc = [&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); }; proxy_ = std::make_shared(handlerFunc, initFunc, fifoSize); + proxy_->pimpl_->setProgressHandler([this]() { progressFlushes(); }); } MSCCLPP_API_CPP SemaphoreId ProxyService::buildAndAddSemaphore(Communicator& communicator, @@ -84,9 +88,28 @@ MSCCLPP_API_CPP PortChannel ProxyService::portChannel(SemaphoreId id, MemoryId d MSCCLPP_API_CPP void ProxyService::startProxy(bool blocking) { proxy_->start(blocking); } -MSCCLPP_API_CPP void ProxyService::stopProxy() { proxy_->stop(); } +MSCCLPP_API_CPP void ProxyService::stopProxy() { + proxy_->stop(); + // Drain pending flushes. After a bounded loop, force-unblock any still-pending GPU + // waiters with a sentinel write (UINT64_MAX > any FIFO position). + for (int i = 0; i < 1000 && !pendingFlushPos_.empty(); ++i) { + progressFlushes(); + } + if (!pendingFlushPos_.empty()) { + WARN(CONN, "stopProxy: ", pendingFlushPos_.size(), " connections still pending; writing sentinel"); + for (auto& [conn, pos] : pendingFlushPos_) { + if (uint64_t* ptr = conn->getFlushDonePtr()) atomicStore(ptr, UINT64_MAX, memoryOrderRelease); + } + pendingFlushPos_.clear(); + } +} ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) { + // The proxy is the sole FIFO consumer and processes in strict push order, so the FIFO's + // tail (between poll() and pop()) matches the value GPU's fifo_.push() returned for this + // trigger — use it directly as our per-trigger sequence number. + uint64_t pos = proxy_->fifo()->tail(); + std::shared_ptr semaphore = semaphores_[trigger.fields.semaphoreId]; auto& conn = semaphore->connection(); @@ -105,9 +128,15 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) { numRequests++; } - if (((trigger.fields.type & TriggerSync) && numRequests > 0) || - (maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize)) { - conn.flush(); + if (trigger.fields.type & TriggerSync) { + // Record this TriggerSync's FIFO position. The GPU caller is spinning on + // flushDonePos_ > pos; progressFlushes() will publish pos+1 once the CQ drains. + // Later TriggerSyncs on the same conn overwrite — CQ drain completes them all at once. + conn.impl_->requestFlush(); + pendingFlushPos_[conn.impl_] = pos; + numRequests = 0; + } else if (maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize) { + conn.flush(); // flow-control flush stays blocking numRequests = 0; } @@ -115,12 +144,27 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) { } MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const { - return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle()); + auto& conn = semaphore_->connection(); + return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), + conn.impl_->getFlushDonePtr()); } MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const { - return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_, - src_); + auto& conn = semaphore_->connection(); + return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_, src_, + conn.impl_->getFlushDonePtr()); +} + +void ProxyService::progressFlushes() { + for (auto it = pendingFlushPos_.begin(); it != pendingFlushPos_.end();) { + if (it->first->progressFlush()) { + // CQ drained: publish pos+1 to unblock GPU waiters whose own pos <= recorded pos. + atomicStore(it->first->getFlushDonePtr(), it->second + 1, memoryOrderRelease); + it = pendingFlushPos_.erase(it); + } else { + ++it; + } + } } } // namespace mscclpp diff --git a/src/core/proxy.cc b/src/core/proxy.cc index de5b90fc..554336e8 100644 --- a/src/core/proxy.cc +++ b/src/core/proxy.cc @@ -1,38 +1,21 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include #include #include #include #include #include -#include #include "api.h" -#include "debug.h" +#include "logger.hpp" +#include "proxy_impl.hpp" namespace mscclpp { constexpr int ProxyStopCheckPeriod = 1000; constexpr int ProxyStartWarnPeriod = 1000; -struct Proxy::Impl { - ProxyHandler handler; - std::function threadInit; - std::shared_ptr fifo; - std::atomic_bool threadStarted; - std::thread service; - std::atomic_bool running; - - Impl(ProxyHandler handler, std::function threadInit, int fifoSize) - : handler(handler), - threadInit(threadInit), - fifo(std::make_shared(fifoSize)), - threadStarted(false), - running(false) {} -}; - MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function threadInit, int fifoSize) { pimpl_ = std::make_unique(handler, threadInit, fifoSize); } @@ -70,18 +53,23 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) { pimpl_->threadStarted.store(true, std::memory_order_release); - ProxyHandler handler = this->pimpl_->handler; - auto fifo = this->pimpl_->fifo; + ProxyHandler handler = pimpl_->handler; + auto progressHandler = pimpl_->progressHandler; + auto fifo = pimpl_->fifo; ProxyTrigger trigger; int runCnt = ProxyStopCheckPeriod; for (;;) { if (runCnt-- == 0) { runCnt = ProxyStopCheckPeriod; - if (!this->pimpl_->running.load(std::memory_order_acquire)) { + if (!pimpl_->running.load(std::memory_order_acquire)) { break; } } + // Per-iteration system work (e.g. progressing pending async flushes), + // run regardless of whether a trigger is ready this iteration. + if (progressHandler) progressHandler(); + // Poll to see if we are ready to send anything trigger = fifo->poll(); if (trigger.fst == 0 || trigger.snd == 0) { // TODO: this check is a potential pitfall for custom triggers @@ -107,7 +95,7 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) { count--; if (count == 0) { count = ProxyStartWarnPeriod; - WARN("Proxy thread startup taking longer than expected."); + WARN(CONN, "Proxy thread startup taking longer than expected."); } } } diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index f4a26cf9..eb8b5485 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -160,6 +160,14 @@ class PortChannelOneToOneTest : public CommunicatorTestBase { void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default); void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default); void testBandwidth(PingPongTestParams params); + void setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase, + std::vector>& sendBuffs, + std::vector& localMems, + std::vector& remoteMems, + std::vector& portChannels); + void testMultiQpBandwidth(IbMode ibMode, int numQps); + void testMultiQpFlushStress(IbMode ibMode, int numQps); + void testSameChanConcurrentFlush(IbMode ibMode); std::shared_ptr proxyService; }; diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index 4b1b0cfb..47034cdb 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -625,3 +625,292 @@ PERF_TEST(PortChannelOneToOneTest, BandwidthIbHostNoAtomicMode) { testBandwidth(PingPongTestParams{ .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); } + +static constexpr int kMaxQps = 4; +__constant__ DeviceHandle gMultiQpPortChans[kMaxQps]; + +// Multi-QP bandwidth kernel: barrier on QP 0 only, then putWithSignal on all QPs. +// Only one signal/wait pair is needed for sync between two GPU kernels. +__global__ void kernelMultiQpBandwidth(int nElemPerChan, int nIters, int numQps) { + if (threadIdx.x != 0) return; + for (int i = 0; i < nIters; i++) { + // Barrier on QP 0 only — syncs both ranks + gMultiQpPortChans[0].signal(); + gMultiQpPortChans[0].wait(); + // Data transfer: put on all QPs simultaneously + for (int q = 0; q < numQps; q++) { + gMultiQpPortChans[q].putWithSignal(0, nElemPerChan * sizeof(int)); + } + // Wait for all remote data arrivals + for (int q = 0; q < numQps; q++) { + gMultiQpPortChans[q].wait(); + } + } +} + +// Multi-QP setup helper: bootstrap N parallel IB connections + port channels in two +// futures-based phases (issue all async ops before resolving any, to avoid deadlock). +// tagBase: distinct base used by each caller so concurrent tests don't clash on tags. +void PortChannelOneToOneTest::setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase, + std::vector>& sendBuffs, + std::vector& localMems, + std::vector& remoteMems, + std::vector& portChannels) { + const int peer = 1 - communicator->bootstrap()->getRank(); + sendBuffs.assign(numQps, nullptr); + localMems.assign(numQps, mscclpp::RegisteredMemory{}); + remoteMems.assign(numQps, mscclpp::RegisteredMemory{}); + portChannels.clear(); + + std::vector> connFutures(numQps); + std::vector> remoteMemFutures(numQps); + + for (int q = 0; q < numQps; q++) { + sendBuffs[q] = mscclpp::GpuBuffer(elemsPerChan).memory(); + localMems[q] = communicator->registerMemory(sendBuffs[q].get(), elemsPerChan * sizeof(int), ibTransport); + + mscclpp::EndpointConfig cfg; + cfg.transport = ibTransport; + cfg.ib.gidIndex = std::stoi(gEnv->args["ib_gid_index"]); + cfg.ib.mode = ibMode; + + connFutures[q] = communicator->connect(cfg, peer, tagBase + q); + communicator->sendMemory(localMems[q], peer, tagBase + numQps + q); + remoteMemFutures[q] = communicator->recvMemory(peer, tagBase + numQps + q); + } + + for (int q = 0; q < numQps; q++) { + auto conn = connFutures[q].get(); + remoteMems[q] = remoteMemFutures[q].get(); + auto sema = communicator->buildSemaphore(conn, peer, tagBase + 2 * numQps + q).get(); + mscclpp::SemaphoreId cid = proxyService->addSemaphore(sema); + portChannels.emplace_back( + proxyService->portChannel(cid, proxyService->addMemory(remoteMems[q]), proxyService->addMemory(localMems[q]))); + } +} + +void PortChannelOneToOneTest::testMultiQpBandwidth(IbMode ibMode, int numQps) { + if (gEnv->rank >= numRanksToUse) return; + + const int rank = communicator->bootstrap()->getRank(); + const int maxElemPerChan = 32 * 1024 * 1024; // 128 MB per channel + + std::vector> sendBuffs; + std::vector localMems; + std::vector remoteMems; + std::vector portChannels; + setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/100, sendBuffs, localMems, remoteMems, portChannels); + + std::vector> handles; + for (auto& ch : portChannels) handles.push_back(ch.deviceHandle()); + ASSERT_EQ(handles.size(), static_cast(numQps)); + ASSERT_LE(numQps, kMaxQps); // numQps must not exceed __constant__ array size (kMaxQps) + MSCCLPP_CUDATHROW( + cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle))); + + proxyService->startProxy(); + + const std::string testName = ::mscclpp::test::currentTestName(); + const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : ""); + + for (int nElemPerChan : + {256, 16 * 1024, 256 * 1024, 1024 * 1024, 4 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024}) { + int nIters = 10000; + // Warm-up + kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, 10, numQps); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + // Measure + mscclpp::Timer timer; + kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, nIters, numQps); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + double elapsedUs = timer.elapsed(); + communicator->bootstrap()->barrier(); + + if (rank == 0) { + double totalBytes = (double)nElemPerChan * sizeof(int) * numQps; + double elapsedMsPerIter = elapsedUs / 1e3 / nIters; + double gbps = totalBytes / elapsedMsPerIter * 1e-6; + double totalSizeKB = totalBytes / 1024.0; + std::string label; + if (totalSizeKB >= 1024.0) + label = std::to_string((int)(totalSizeKB / 1024.0)) + " MB"; + else + label = std::to_string((int)totalSizeKB) + " KB"; + ::mscclpp::test::reportPerfResult(label + " (" + qpLabel + ")", gbps, "GB/s"); + } + } + + proxyService->stopProxy(); + + for (auto& m : localMems) registeredMemories.push_back(m); + for (auto& m : remoteMems) registeredMemories.push_back(m); +} + +PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostMode) { + REQUIRE_IBVERBS; + REQUIRE_GDR_FOR_IB_MODE(IbMode::Host); + for (int numQps : {1, 2, 4}) { + testMultiQpBandwidth(IbMode::Host, numQps); + } +} + +PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostNoAtomicMode) { + REQUIRE_IBVERBS; + REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic); + for (int numQps : {1, 2, 4}) { + testMultiQpBandwidth(IbMode::HostNoAtomic, numQps); + } +} + +// Multi-QP flush-stress kernel: one thread per QP, all calling putWithSignalAndFlush +// concurrently so all N CQ drains are in flight on the proxy thread at once. +// This is the concurrent-flush worst case the async-progress design protects against. +__global__ void kernelMultiQpFlushStress(int nElemPerChan, int nIters, int numQps) { + int q = threadIdx.x; + if (q >= numQps) return; + for (int i = 0; i < nIters; i++) { + if (q == 0) { + gMultiQpPortChans[0].signal(); + gMultiQpPortChans[0].wait(); + } + __syncthreads(); + gMultiQpPortChans[q].putWithSignalAndFlush(0, nElemPerChan * sizeof(int)); + gMultiQpPortChans[q].wait(); + __syncthreads(); + } +} + +void PortChannelOneToOneTest::testMultiQpFlushStress(IbMode ibMode, int numQps) { + if (gEnv->rank >= numRanksToUse) return; + + const int rank = communicator->bootstrap()->getRank(); + const int maxElemPerChan = 64 * 1024; + + std::vector> sendBuffs; + std::vector localMems; + std::vector remoteMems; + std::vector portChannels; + setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/400, sendBuffs, localMems, remoteMems, portChannels); + + std::vector> handles; + for (auto& ch : portChannels) handles.push_back(ch.deviceHandle()); + ASSERT_EQ(handles.size(), static_cast(numQps)); + ASSERT_LE(numQps, kMaxQps); + MSCCLPP_CUDATHROW( + cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle))); + + proxyService->startProxy(); + + const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : ""); + + for (int nElemPerChan : {256, 4 * 1024, 64 * 1024}) { + int nIters = 2000; + kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, 10, numQps); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + mscclpp::Timer timer; + kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, nIters, numQps); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + double elapsedUs = timer.elapsed(); + communicator->bootstrap()->barrier(); + + if (rank == 0) { + double usPerIter = elapsedUs / nIters; + double usPerIterPerQp = usPerIter / numQps; + int bytesPerChan = nElemPerChan * (int)sizeof(int); + std::string sizeLabel = (bytesPerChan >= 1024) ? (std::to_string(bytesPerChan / 1024) + " KB") + : (std::to_string(bytesPerChan) + " B"); + ::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter", usPerIter, "us"); + ::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter/QP", usPerIterPerQp, "us"); + } + } + + proxyService->stopProxy(); + + for (auto& m : localMems) registeredMemories.push_back(m); + for (auto& m : remoteMems) registeredMemories.push_back(m); +} + +PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostMode) { + REQUIRE_IBVERBS; + REQUIRE_GDR_FOR_IB_MODE(IbMode::Host); + for (int numQps : {1, 2, 4}) { + testMultiQpFlushStress(IbMode::Host, numQps); + } +} + +PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostNoAtomicMode) { + REQUIRE_IBVERBS; + REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic); + for (int numQps : {1, 2, 4}) { + testMultiQpFlushStress(IbMode::HostNoAtomic, numQps); + } +} + +// Same-channel concurrent-flush kernel: N GPU threads on the same PortChannel each call +// putWithSignalAndFlush in lockstep. Stresses the FIFO-position-based wait target so that +// each caller waits on its own TriggerSync rather than on a globally-incrementing counter +// that could be assigned out-of-order relative to the FIFO push order. +__constant__ DeviceHandle gSingleChanForConcurrentFlush; + +__global__ void kernelSameChanConcurrentFlush(int nIters) { + auto& chan = gSingleChanForConcurrentFlush; + int tid = threadIdx.x; + for (int i = 0; i < nIters; i++) { + // Each thread writes to a distinct slot (so puts don't overlap on remote side), + // then concurrently flushes on the same channel. + uint64_t offset = tid * sizeof(int); + chan.putWithSignalAndFlush(offset, offset, sizeof(int)); + // Each thread waits for one signal from the remote rank's symmetric putWithSignalAndFlush. + chan.wait(); + } +} + +void PortChannelOneToOneTest::testSameChanConcurrentFlush(IbMode ibMode) { + if (gEnv->rank >= numRanksToUse) return; + + constexpr int nThreads = 4; + std::vector> sendBuffs; + std::vector localMems; + std::vector remoteMems; + std::vector portChannels; + setupMultiQpChannels(/*numQps=*/1, /*elemsPerChan=*/nThreads, ibMode, /*tagBase=*/700, sendBuffs, localMems, + remoteMems, portChannels); + + DeviceHandle handle = portChannels[0].deviceHandle(); + MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gSingleChanForConcurrentFlush, &handle, sizeof(handle))); + + proxyService->startProxy(); + + // Warm-up + kernelSameChanConcurrentFlush<<<1, nThreads>>>(10); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + communicator->bootstrap()->barrier(); + + // Measure: a successful completion (no deadlock, no CQ error) validates that each + // concurrent-flush caller waited on its own TriggerSync (not someone else's earlier one). + const int nIters = 500; + mscclpp::Timer timer; + kernelSameChanConcurrentFlush<<<1, nThreads>>>(nIters); + MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); + double elapsedUs = timer.elapsed(); + communicator->bootstrap()->barrier(); + + if (communicator->bootstrap()->getRank() == 0) { + double usPerIter = elapsedUs / nIters; + ::mscclpp::test::reportPerfResult(std::to_string(nThreads) + " threads same-chan per-iter", usPerIter, "us"); + } + + proxyService->stopProxy(); + for (auto& m : localMems) registeredMemories.push_back(m); + for (auto& m : remoteMems) registeredMemories.push_back(m); +} + +TEST(PortChannelOneToOneTest, SameChanConcurrentFlushIbHostMode) { + REQUIRE_IBVERBS; + REQUIRE_GDR_FOR_IB_MODE(IbMode::Host); + testSameChanConcurrentFlush(IbMode::Host); +} From 60a6d7219f1d97c32c275864ed3fb456b6b3f300 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 15 May 2026 14:06:50 -0700 Subject: [PATCH 11/12] Clean up completed communicator receives (#804) ## Summary - Release the reference after last requests are ready. - Keep ordered receive chaining for repeated rank/tag operations while cleaning up completed receive bookkeeping. --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/core/communicator.cc | 102 +++++++++++++++++------------- src/core/include/communicator.hpp | 5 +- 2 files changed, 62 insertions(+), 45 deletions(-) diff --git a/src/core/communicator.cc b/src/core/communicator.cc index c95ca421..41e46bc5 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -7,6 +7,36 @@ namespace mscclpp { +namespace { + +template +std::shared_future makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Func func) { + // Weak placeholder to avoid a reference cycle; updated with the real recvItem after the future is created. + auto thisRecvItem = std::make_shared>(); + auto future = std::async(std::launch::deferred, + [impl, remoteRank, tag, thisRecvItem, lastRecvItem = impl->getLastRecvItem(remoteRank, tag), + func = std::move(func)]() mutable { + auto cleanup = [impl, remoteRank, tag, thisRecvItem]() { + impl->clearLastRecvItemIfMatches(remoteRank, tag, thisRecvItem->lock()); + }; + + if (lastRecvItem) { + // Recursive call to the previous receive items + lastRecvItem->wait(); + } + auto result = func(); + cleanup(); + return result; + }); + auto sharedFuture = std::shared_future(std::move(future)); + auto recvItem = std::make_shared>(sharedFuture); + *thisRecvItem = recvItem; + impl->setLastRecvItem(remoteRank, tag, recvItem); + return sharedFuture; +} + +} // namespace + Communicator::Impl::Impl(std::shared_ptr bootstrap, std::shared_ptr context) : bootstrap_(bootstrap) { if (!context) { @@ -32,6 +62,14 @@ std::shared_ptr Communicator::Impl::getLastRecvItem(int remoteRank return it->second; } +void Communicator::Impl::clearLastRecvItemIfMatches(int remoteRank, int tag, + const std::shared_ptr& expectedItem) { + auto it = lastRecvItems_.find({remoteRank, tag}); + if (it != lastRecvItems_.end() && it->second == expectedItem) { + lastRecvItems_.erase(it); + } +} + MSCCLPP_API_CPP Communicator::~Communicator() = default; MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr bootstrap, std::shared_ptr context) @@ -83,19 +121,11 @@ MSCCLPP_API_CPP std::shared_future Communicator::recvMemory(in locRecvMemList.push_back(std::move(locRecvMem)); return future; } - auto future = std::async(std::launch::deferred, - [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - return RegisteredMemory::deserialize(data); - }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag]() { + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + return RegisteredMemory::deserialize(data); + }); } MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpoint& localEndpoint, int remoteRank, @@ -112,22 +142,15 @@ MSCCLPP_API_CPP std::shared_future Communicator::connect(const Endpo bootstrap()->send(localEndpoint.serialize(), remoteRank, tag); - auto future = std::async(std::launch::deferred, [this, remoteRank, tag, localEndpoint, - lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() mutable { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - auto remoteEndpoint = Endpoint::deserialize(data); - auto connection = context()->connect(localEndpoint, remoteEndpoint); - pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag}; - return connection; - }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, + [this, remoteRank, tag, localEndpoint]() mutable { + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + auto remoteEndpoint = Endpoint::deserialize(data); + auto connection = context()->connect(localEndpoint, remoteEndpoint); + pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag}; + return connection; + }); } MSCCLPP_API_CPP std::shared_future Communicator::connect(const EndpointConfig& localConfig, int remoteRank, @@ -141,21 +164,12 @@ MSCCLPP_API_CPP std::shared_future Communicator::buildSemaphore(const SemaphoreStub localStub(connection); bootstrap()->send(localStub.serialize(), remoteRank, tag); - auto future = - std::async(std::launch::deferred, [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag), - localStub = localStub]() mutable { - if (lastRecvItem) { - // Recursive call to the previous receive items - lastRecvItem->wait(); - } - std::vector data; - bootstrap()->recv(data, remoteRank, tag); - auto remoteStub = SemaphoreStub::deserialize(data); - return Semaphore(localStub, remoteStub); - }); - auto shared_future = std::shared_future(std::move(future)); - pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared>(shared_future)); - return shared_future; + return makeOrderedRecvFuture(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localStub]() mutable { + std::vector data; + bootstrap()->recv(data, remoteRank, tag); + auto remoteStub = SemaphoreStub::deserialize(data); + return Semaphore(localStub, remoteStub); + }); } MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { diff --git a/src/core/include/communicator.hpp b/src/core/include/communicator.hpp index 8d7539ef..f15e20f7 100644 --- a/src/core/include/communicator.hpp +++ b/src/core/include/communicator.hpp @@ -62,7 +62,7 @@ struct Communicator::Impl { std::unordered_map connectionInfos_; // Temporary storage for the latest RecvItem of each {remoteRank, tag} pair. - // If the RecvItem gets ready, it will be removed at the next call to getLastRecvItem. + // The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready. std::unordered_map, std::shared_ptr, PairHash> lastRecvItems_; // RegisteredMemory items sent to the local rank of each tag. Sending memory to the local rank is @@ -79,6 +79,9 @@ struct Communicator::Impl { // If the item is ready, it will be removed from the map and nullptr will be returned. std::shared_ptr getLastRecvItem(int remoteRank, int tag); + // Clear the last RecvItem only if it still matches the expected item. + void clearLastRecvItemIfMatches(int remoteRank, int tag, const std::shared_ptr& expectedItem); + struct Connector; }; From 20bd1ec55b9cb12c03db05b2fad8df92fe0c1fc1 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Mon, 18 May 2026 21:44:20 +0000 Subject: [PATCH 12/12] ext/ep: fix CUDA 11.8 build + apply clang-format/black - src/core/atomicadd_kernel.cu: restore the legacy 3-arg cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice) in the '#else' branch of the CUDA_VERSION >= 12050 guard. A prior edit had corrupted it to 'cuCtxCreate(&proxyAtomicCtx_vice)', which broke the CUDA 11.8 build (CodeQL CUDA cuda11.8 and MSCCLPPLang cuda11.8 jobs). - Apply clang-format to src/ext/ep/* (no logic changes, fixes the cpplint CI job). - Apply black to test/python/ext/ep/test_internode_multirank.py and test_intranode_multirank.py (no logic changes, fixes the pylint CI job). --- src/core/atomicadd_kernel.cu | 8 +- src/ext/ep/buffer.cc | 104 +++++---- src/ext/ep/buffer.hpp | 4 +- src/ext/ep/kernels/api.cuh | 21 +- src/ext/ep/kernels/buffer.cuh | 3 +- src/ext/ep/kernels/internode.cu | 210 ++++++++---------- src/ext/ep/kernels/internode_ll.cu | 18 +- src/ext/ep/kernels/utils.cuh | 2 +- .../python/ext/ep/test_internode_multirank.py | 9 +- .../python/ext/ep/test_intranode_multirank.py | 12 +- 10 files changed, 177 insertions(+), 214 deletions(-) diff --git a/src/core/atomicadd_kernel.cu b/src/core/atomicadd_kernel.cu index 21427df6..f4ae3b02 100644 --- a/src/core/atomicadd_kernel.cu +++ b/src/core/atomicadd_kernel.cu @@ -37,13 +37,13 @@ void CudaIpcStream::atomicAdd(uint64_t* dst, int64_t value) { CUresult res = cuDeviceGet(&cuDevice, deviceId_); if (res != CUDA_SUCCESS) throw Error("cuDeviceGet failed", ErrorCode::InternalError); - // cuCtxCreate added a `paramsArray` argument in CUDA 12.5 — use the - // 4-arg form on new toolkits, fall back to the legacy 3-arg form on - // CUDA < 12.5 so we keep compiling against older drivers/toolkits. + // cuCtxCreate added a `paramsArray` argument in CUDA 12.5 — use the + // 4-arg form on new toolkits, fall back to the legacy 3-arg form on + // CUDA < 12.5 so we keep compiling against older drivers/toolkits. #if CUDA_VERSION >= 12050 res = cuCtxCreate_v4(&proxyAtomicCtx_, NULL, 0, cuDevice); #else - res = cuCtxCreate(&proxyAtomicCtx_vice); + res = cuCtxCreate(&proxyAtomicCtx_, 0, cuDevice); #endif if (res != CUDA_SUCCESS) throw Error("cuCtxCreate failed", ErrorCode::InternalError); diff --git a/src/ext/ep/buffer.cc b/src/ext/ep/buffer.cc index 02b38c36..f6e5870b 100644 --- a/src/ext/ep/buffer.cc +++ b/src/ext/ep/buffer.cc @@ -154,8 +154,7 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ if (v > 0 && v <= NUM_MAX_NVL_PEERS) local_world_size = v; } rdma_rank = rank / local_world_size, nvl_rank = rank % local_world_size; - num_rdma_ranks = std::max(1, num_ranks / local_world_size), - num_nvl_ranks = std::min(num_ranks, local_world_size); + num_rdma_ranks = std::max(1, num_ranks / local_world_size), num_nvl_ranks = std::min(num_ranks, local_world_size); // Get device info cudaDeviceProp device_prop = {}; @@ -404,8 +403,7 @@ void Buffer::sync(const std::vector& device_ids, // worker ranks cannot reach. static const bool fabric_ipc_supported = resolve_fabric_ipc_supported(); const bool use_fabric_ipc_alloc = - mscclpp::isNvlsSupported() && fabric_ipc_supported && - (low_latency_mode || num_rdma_ranks > 1); + mscclpp::isNvlsSupported() && fabric_ipc_supported && (low_latency_mode || num_rdma_ranks > 1); if (use_fabric_ipc_alloc) { rdma_buffer_ptr = mscclpp::detail::gpuCallocPhysical(num_rdma_bytes); CUDA_CHECK(cudaMemset(rdma_buffer_ptr, 0, num_rdma_bytes)); @@ -415,8 +413,8 @@ void Buffer::sync(const std::vector& device_ids, } if (rank == 0) { printf("[mscclpp_ep] rdma_buffer allocator: %s (low_latency=%d, nvls=%d, fabric_ipc=%d)\n", - use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc", - (int)low_latency_mode, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported); + use_fabric_ipc_alloc ? "gpuCallocPhysical (fabric-IPC)" : "cudaMalloc", (int)low_latency_mode, + (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported); fflush(stdout); } bootstrap->barrier(); @@ -584,10 +582,8 @@ void Buffer::sync(const std::vector& device_ids, // one slot per global rank (worst-case num_ranks = NUM_MAX_RDMA_PEERS * // NUM_MAX_NVL_PEERS). Each rank writes its own slot via `multimem.st`; // every receiver then reads the sub-position destined to it. - const size_t kPerSenderSlotBytes = - static_cast(NUM_MAX_RDMA_PEERS) * kNvlsPerPeerBytes; - const size_t kMaxRanks = - static_cast(NUM_MAX_RDMA_PEERS) * NUM_MAX_NVL_PEERS; + const size_t kPerSenderSlotBytes = static_cast(NUM_MAX_RDMA_PEERS) * kNvlsPerPeerBytes; + const size_t kMaxRanks = static_cast(NUM_MAX_RDMA_PEERS) * NUM_MAX_NVL_PEERS; const size_t data_bytes = kMaxRanks * kPerSenderSlotBytes; // 256 B alignment for each sub-region to keep `multimem` ops well-aligned. @@ -612,8 +608,8 @@ void Buffer::sync(const std::vector& device_ids, // propagates — there is no clean fallback mid-collective. The // `isNvlsSupported()` gate above is the production guard. nvls_ht_conn = mscclpp::connectNvlsCollective(communicator, all_ranks, nvls_ht_buffer->bytes()); - auto sw = nvls_ht_conn->bindAllocatedMemory( - reinterpret_cast(nvls_ht_buffer->data()), nvls_ht_buffer->bytes()); + auto sw = nvls_ht_conn->bindAllocatedMemory(reinterpret_cast(nvls_ht_buffer->data()), + nvls_ht_buffer->bytes()); nvls_ht_sc = std::make_shared(std::move(sw)); auto h = nvls_ht_sc->deviceHandle(); nvls_ht_mc_ptr = h.mcPtr; @@ -623,10 +619,9 @@ void Buffer::sync(const std::vector& device_ids, // DIAG: print mcPtr/devicePtr/buf-VA per rank to verify whether // connectNvlsCollective produced a multicast that actually spans // both nodes (suspected: per-node only on Azure GB200). - printf( - "[mscclpp_ep] NVLS HT diag rank=%d mcPtr=%p devicePtr=%p bufVA=%p bytes=%zu\n", - rank, (void*)nvls_ht_mc_ptr, (void*)nvls_ht_dev_ptr, - (void*)nvls_ht_buffer->data(), (size_t)nvls_ht_buffer->bytes()); + printf("[mscclpp_ep] NVLS HT diag rank=%d mcPtr=%p devicePtr=%p bufVA=%p bytes=%zu\n", rank, + (void*)nvls_ht_mc_ptr, (void*)nvls_ht_dev_ptr, (void*)nvls_ht_buffer->data(), + (size_t)nvls_ht_buffer->bytes()); fflush(stdout); bootstrap->barrier(); @@ -635,16 +630,15 @@ void Buffer::sync(const std::vector& device_ids, printf( "[mscclpp_ep] NVLS HT multicast: enabled=%d total=%zu KB " "(tail@%zu head@%zu barrier@%zu data@%zu)\n", - (int)nvls_ht_enabled, nvls_ht_total_bytes / 1024, nvls_ht_off_tail, nvls_ht_off_head, - nvls_ht_off_barrier, nvls_ht_off_data); + (int)nvls_ht_enabled, nvls_ht_total_bytes / 1024, nvls_ht_off_tail, nvls_ht_off_head, nvls_ht_off_barrier, + nvls_ht_off_data); fflush(stdout); } } else if (rank == 0) { printf( "[mscclpp_ep] NVLS HT multicast: disabled (low_latency=%d, num_rdma_ranks=%d, " "nvls_supported=%d, fabric_ipc=%d)\n", - (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(), - (int)fabric_ipc_supported); + (int)low_latency_mode, num_rdma_ranks, (int)mscclpp::isNvlsSupported(), (int)fabric_ipc_supported); fflush(stdout); } @@ -717,8 +711,8 @@ void Buffer::sync(const std::vector& device_ids, peer_rdma_bases[r] = remote_mems[r].data(); } CUDA_CHECK(cudaMalloc(&peer_rdma_bases_gpu, sizeof(void*) * num_ranks)); - CUDA_CHECK(cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases.data(), - sizeof(void*) * num_ranks, cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(peer_rdma_bases_gpu, peer_rdma_bases.data(), sizeof(void*) * num_ranks, cudaMemcpyHostToDevice)); if (rank == 0) { printf("[mscclpp_ep] Phase 4 fabric-IPC peer bases (rank 0):\n"); for (int r = 0; r < num_ranks; ++r) { @@ -1301,10 +1295,8 @@ Buffer::internode_dispatch( buffer_ptrs_gpu, config.num_max_nvl_chunked_recv_tokens, task_fifo_ptrs_gpu, head, rank, comm_stream, config.get_rdma_buffer_size_hint(hidden_int4 * sizeof(int4), num_ranks), num_nvl_bytes, true, low_latency_mode, port_channel_handles_device_ptr.get(), - memory_channel_handles_device_ptr.get(), - nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, - nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, - nvls_ht_off_barrier, nvls_ht_cached_epoch); + memory_channel_handles_device_ptr.get(), nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, + nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, nvls_ht_off_barrier, nvls_ht_cached_epoch); move_fifo_slots(2); } else { rdma_channel_prefix_matrix = @@ -1329,9 +1321,8 @@ Buffer::internode_dispatch( buffer_ptrs_gpu, config.num_max_nvl_chunked_recv_tokens, task_fifo_ptrs_gpu, head, rank, comm_stream, config.get_rdma_buffer_size_hint(hidden_int4 * sizeof(int4), num_ranks), num_nvl_bytes, low_latency_mode, port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), - nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, - nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, - nvls_ht_off_barrier, nvls_ht_off_data, nvls_ht_epoch, kNvlsPerPeerBytes); + nvls_ht_enabled ? nvls_ht_mc_ptr : nullptr, nvls_ht_enabled ? nvls_ht_dev_ptr : nullptr, nvls_ht_off_barrier, + nvls_ht_off_data, nvls_ht_epoch, kNvlsPerPeerBytes); move_fifo_slots(3); // Synchronize total received tokens and tokens per expert @@ -1396,10 +1387,14 @@ Buffer::internode_dispatch( // Phase 3: pass NVLS counter region pointers (head/tail × mc/dev). When // `nvls_ht_enabled` is false, all four are nullptr and the kernel falls // back to the legacy PortChannel/atomicAdd path. - void* nvls_head_mc = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr; - void* nvls_head_dev = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr; - void* nvls_tail_mc = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr; - void* nvls_tail_dev = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr; + void* nvls_head_mc = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr; + void* nvls_head_dev = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr; + void* nvls_tail_mc = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr; + void* nvls_tail_dev = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr; internode::dispatch(recv_x.data_ptr(), recv_x_scales_ptr, recv_topk_idx_ptr, recv_topk_weights_ptr, cached_mode ? nullptr : recv_src_meta->data_ptr(), x.data_ptr(), x_scales_ptr, topk_idx_ptr, topk_weights_ptr, cached_mode ? nullptr : send_rdma_head->data_ptr(), @@ -1412,9 +1407,8 @@ Buffer::internode_dispatch( rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens, config.num_max_rdma_chunked_recv_tokens, buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens, config.num_max_nvl_chunked_recv_tokens, rank, num_ranks, cached_mode, comm_stream, num_channels, low_latency_mode, - port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), - nvls_head_mc, nvls_head_dev, nvls_tail_mc, nvls_tail_dev, - peer_rdma_bases_gpu); + port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), nvls_head_mc, + nvls_head_dev, nvls_tail_mc, nvls_tail_dev, peer_rdma_bases_gpu); // Wait streams std::optional event; @@ -1531,28 +1525,30 @@ std::tuple, std::optional(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr; - void* combine_nvls_head_dev = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr; - void* combine_nvls_tail_mc = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr; - void* combine_nvls_tail_dev = nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr; - internode::combine(at::cuda::ScalarTypeToCudaDataType(x.scalar_type()), combined_x.data_ptr(), - combined_topk_weights_ptr, is_combined_token_in_rank.data_ptr(), x.data_ptr(), - topk_weights_ptr, combined_rdma_head.data_ptr(), combined_nvl_head.data_ptr(), - src_meta.data_ptr(), rdma_channel_prefix_matrix.data_ptr(), - rdma_rank_prefix_sum.data_ptr(), gbl_channel_prefix_matrix.data_ptr(), num_tokens, - num_combined_tokens, hidden, num_topk, rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens, - config.num_max_rdma_chunked_recv_tokens, buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens, - config.num_max_nvl_chunked_recv_tokens, rank, num_ranks, comm_stream, num_channels, - low_latency_mode, port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), - combine_nvls_head_mc, combine_nvls_head_dev, combine_nvls_tail_mc, combine_nvls_tail_dev, - peer_rdma_bases_gpu); + void* combine_nvls_head_mc = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_head) : nullptr; + void* combine_nvls_head_dev = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_head) : nullptr; + void* combine_nvls_tail_mc = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_mc_ptr) + nvls_ht_off_tail) : nullptr; + void* combine_nvls_tail_dev = + nvls_ht_enabled ? static_cast(static_cast(nvls_ht_dev_ptr) + nvls_ht_off_tail) : nullptr; + internode::combine( + at::cuda::ScalarTypeToCudaDataType(x.scalar_type()), combined_x.data_ptr(), combined_topk_weights_ptr, + is_combined_token_in_rank.data_ptr(), x.data_ptr(), topk_weights_ptr, combined_rdma_head.data_ptr(), + combined_nvl_head.data_ptr(), src_meta.data_ptr(), rdma_channel_prefix_matrix.data_ptr(), + rdma_rank_prefix_sum.data_ptr(), gbl_channel_prefix_matrix.data_ptr(), num_tokens, num_combined_tokens, + hidden, num_topk, rdma_buffer_ptr, config.num_max_rdma_chunked_send_tokens, + config.num_max_rdma_chunked_recv_tokens, buffer_ptrs_gpu, config.num_max_nvl_chunked_send_tokens, + config.num_max_nvl_chunked_recv_tokens, rank, num_ranks, comm_stream, num_channels, low_latency_mode, + port_channel_handles_device_ptr.get(), memory_channel_handles_device_ptr.get(), combine_nvls_head_mc, + combine_nvls_head_dev, combine_nvls_tail_mc, combine_nvls_tail_dev, peer_rdma_bases_gpu); std::optional event; if (async) { diff --git a/src/ext/ep/buffer.hpp b/src/ext/ep/buffer.hpp index 36453af5..e9749847 100644 --- a/src/ext/ep/buffer.hpp +++ b/src/ext/ep/buffer.hpp @@ -155,8 +155,8 @@ struct Buffer { // Worst-case shape parameters used to size the buffer: // stride_per_channel = num_rdma_ranks * num_rdma_ranks (counter slots) // We allocate for `kNvlsMaxChannels` so any `num_sms` config fits. - static constexpr int kNvlsMaxChannels = 64; // num_sms / 2 upper bound - static constexpr int kNvlsPerPeerBytes = 1024; // small-data per (sender, receiver) pair + static constexpr int kNvlsMaxChannels = 64; // num_sms / 2 upper bound + static constexpr int kNvlsPerPeerBytes = 1024; // small-data per (sender, receiver) pair // Number of distinct barrier slots in the barrier sub-region (each u64). static constexpr int kNvlsBarrierSlots = 8; diff --git a/src/ext/ep/kernels/api.cuh b/src/ext/ep/kernels/api.cuh index d728faa9..c47f2422 100644 --- a/src/ext/ep/kernels/api.cuh +++ b/src/ext/ep/kernels/api.cuh @@ -77,10 +77,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_mc_ptr, void* nvls_dev_ptr, - size_t nvls_off_barrier, size_t nvls_off_data, - uint64_t nvls_epoch, int nvls_per_peer_bytes); + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr, + size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch, int nvls_per_peer_bytes); void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv_topk_weights, void* recv_src_meta, const void* x, const float* x_scales, const int64_t* topk_idx, const float* topk_weights, @@ -93,9 +91,8 @@ void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks, bool is_cached_dispatch, cudaStream_t stream, int num_channels, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev, - void* const* peer_rdma_bases); + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev, + void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases); void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_topk_weights, int num_ranks, int num_channels, int num_combined_tokens, int* combined_rdma_head, @@ -104,9 +101,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool is_cached_dispatch, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_mc_ptr = nullptr, void* nvls_dev_ptr = nullptr, - size_t nvls_off_barrier = 0, uint64_t nvls_epoch = 0); + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr = nullptr, + void* nvls_dev_ptr = nullptr, size_t nvls_off_barrier = 0, uint64_t nvls_epoch = 0); void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights, const bool* is_combined_token_in_rank, const void* x, const float* topk_weights, const int* combined_rdma_head, const int* combined_nvl_head, @@ -116,9 +112,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev, - void* const* peer_rdma_bases); + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev, + void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases); } // namespace internode diff --git a/src/ext/ep/kernels/buffer.cuh b/src/ext/ep/kernels/buffer.cuh index e006969d..98505c3c 100644 --- a/src/ext/ep/kernels/buffer.cuh +++ b/src/ext/ep/kernels/buffer.cuh @@ -2,9 +2,10 @@ // Licensed under the MIT License. #pragma once +#include + #include "configs.cuh" #include "exception.cuh" -#include namespace mscclpp { namespace ep { diff --git a/src/ext/ep/kernels/internode.cu b/src/ext/ep/kernels/internode.cu index 2b3dea6d..80ffc0f3 100644 --- a/src/ext/ep/kernels/internode.cu +++ b/src/ext/ep/kernels/internode.cu @@ -1,9 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. #include -#include #include #include +#include #include "buffer.cuh" #include "configs.cuh" @@ -22,8 +22,7 @@ namespace ep { using NvlPackT = std::conditional_t; static_assert(NUM_MAX_NVL_PEERS == 8 || NUM_MAX_NVL_PEERS == 4, "NUM_MAX_NVL_PEERS must be 4 or 8 for HT internode kernel"); -static_assert(NUM_MAX_NVL_PEERS * sizeof(bool) == sizeof(NvlPackT), - "NvlPackT size must match NUM_MAX_NVL_PEERS bools"); +static_assert(NUM_MAX_NVL_PEERS * sizeof(bool) == sizeof(NvlPackT), "NvlPackT size must match NUM_MAX_NVL_PEERS bools"); namespace internode { @@ -273,23 +272,19 @@ __host__ __device__ __forceinline__ std::pair get_nvl_clean_meta(int h } template -__global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks, - const int* num_tokens_per_rdma_rank, int* moe_recv_rdma_counter_mapped, - const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, int num_experts, - const bool* is_token_in_rank, int num_tokens, int num_channels, int expert_alignment, - const int rdma_clean_offset, const int rdma_num_int_clean, const int nvl_clean_offset, - const int nvl_num_int_clean, int* rdma_channel_prefix_matrix, - int* recv_rdma_rank_prefix_sum, int* gbl_channel_prefix_matrix, - int* recv_gbl_rank_prefix_sum, void* rdma_buffer_ptr, void** buffer_ptrs, - int** task_fifo_ptrs, int head, int rank, - mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - // NVLS Phase 2 — replaces port_channel signal/wait + putWithSignal. - // When `nvls_mc_ptr == nullptr` the legacy PortChannel path runs - // unchanged (fallback for non-NVLS IB platforms). - void* nvls_mc_ptr, void* nvls_dev_ptr, - size_t nvls_off_barrier, size_t nvls_off_data, - uint64_t nvls_epoch, int nvls_per_peer_bytes) { +__global__ void notify_dispatch( + const int* num_tokens_per_rank, int* moe_recv_counter_mapped, int num_ranks, const int* num_tokens_per_rdma_rank, + int* moe_recv_rdma_counter_mapped, const int* num_tokens_per_expert, int* moe_recv_expert_counter_mapped, + int num_experts, const bool* is_token_in_rank, int num_tokens, int num_channels, int expert_alignment, + const int rdma_clean_offset, const int rdma_num_int_clean, const int nvl_clean_offset, const int nvl_num_int_clean, + int* rdma_channel_prefix_matrix, int* recv_rdma_rank_prefix_sum, int* gbl_channel_prefix_matrix, + int* recv_gbl_rank_prefix_sum, void* rdma_buffer_ptr, void** buffer_ptrs, int** task_fifo_ptrs, int head, int rank, + mscclpp::PortChannelDeviceHandle* port_channel_handles, mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, + // NVLS Phase 2 — replaces port_channel signal/wait + putWithSignal. + // When `nvls_mc_ptr == nullptr` the legacy PortChannel path runs + // unchanged (fallback for non-NVLS IB platforms). + void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch, + int nvls_per_peer_bytes) { auto sm_id = static_cast(blockIdx.x); auto thread_id = static_cast(threadIdx.x), warp_id = thread_id / 32, lane_id = get_lane_id(); auto num_threads = static_cast(blockDim.x), num_warps = num_threads / 32; @@ -397,9 +392,8 @@ __global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_co // the local send region — contiguous from send_buffer(0). int* src_ints = rdma_recv_num_tokens_mixed.send_buffer(0); const int total_ints = num_elems * kNumRDMARanks; - int* mc_slot = reinterpret_cast( - static_cast(nvls_mc_ptr) + nvls_off_data + - static_cast(my_global_rank) * slot_stride_bytes); + int* mc_slot = reinterpret_cast(static_cast(nvls_mc_ptr) + nvls_off_data + + static_cast(my_global_rank) * slot_stride_bytes); for (int i = thread_id; i < total_ints; i += num_threads) { int val = src_ints[i]; MSCCLPP_EP_MULTIMEM_ST_RELAXED_U32(mc_slot + i, val); @@ -425,10 +419,9 @@ __global__ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_co // the legacy SymBuffer recv_buffer(s) location. for (int s = 0; s < kNumRDMARanks; ++s) { const int sender_global = kLowLatencyMode ? s : (s * NUM_MAX_NVL_PEERS + nvl_rank); - const int* nvls_src = reinterpret_cast( - static_cast(nvls_dev_ptr) + nvls_off_data + - static_cast(sender_global) * slot_stride_bytes + - static_cast(rdma_rank) * num_bytes); + const int* nvls_src = reinterpret_cast(static_cast(nvls_dev_ptr) + nvls_off_data + + static_cast(sender_global) * slot_stride_bytes + + static_cast(rdma_rank) * num_bytes); int* dst = rdma_recv_num_tokens_mixed.recv_buffer(s); for (int i = thread_id; i < num_elems; i += num_threads) { dst[i] = nvls_src[i]; @@ -622,10 +615,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_mc_ptr, void* nvls_dev_ptr, - size_t nvls_off_barrier, size_t nvls_off_data, - uint64_t nvls_epoch, int nvls_per_peer_bytes) { + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr, + size_t nvls_off_barrier, size_t nvls_off_data, uint64_t nvls_epoch, int nvls_per_peer_bytes) { #define NOTIFY_DISPATCH_LAUNCH_CASE(num_rdma_ranks) \ { \ auto notify_dispatch_func = \ @@ -636,8 +627,8 @@ void notify_dispatch(const int* num_tokens_per_rank, int* moe_recv_counter_mappe expert_alignment, rdma_clean_meta.first, rdma_clean_meta.second, nvl_clean_meta.first, \ nvl_clean_meta.second, rdma_channel_prefix_matrix, recv_rdma_rank_prefix_sum, \ gbl_channel_prefix_matrix, recv_gbl_rank_prefix_sum, rdma_buffer_ptr, buffer_ptrs, task_fifo_ptrs, \ - head, rank, port_channel_handles, memory_channel_handles, \ - nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_off_data, nvls_epoch, nvls_per_peer_bytes); \ + head, rank, port_channel_handles, memory_channel_handles, nvls_mc_ptr, nvls_dev_ptr, \ + nvls_off_barrier, nvls_off_data, nvls_epoch, nvls_per_peer_bytes); \ } \ break @@ -856,10 +847,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV const auto dst_offset = rdma_rank * num_bytes + meta_recv_offset; const auto src_offset = dst_rdma_rank * num_bytes + meta_send_offset; const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const int* src_p = reinterpret_cast( - reinterpret_cast(rdma_buffer_ptr_base) + src_offset); - int* dst_p = reinterpret_cast( - reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); + const int* src_p = reinterpret_cast(reinterpret_cast(rdma_buffer_ptr_base) + src_offset); + int* dst_p = reinterpret_cast(reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); const int n_int = (int)(num_bytes / sizeof(int)); for (int k = lane_id; k < n_int; k += 32) { dst_p[k] = src_p[k]; @@ -904,8 +893,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // Phase 4: head feedback path \u2014 cross-node uses fabric-VA store, // self-loop uses local atomic. Both end up in rdma_channel_head; // single read via ld_volatile_global covers them. NVLS removed. - cached_rdma_channel_head = - static_cast(ld_volatile_global(rdma_channel_head.buffer(lane_id))); + cached_rdma_channel_head = static_cast(ld_volatile_global(rdma_channel_head.buffer(lane_id))); } } __syncwarp(); @@ -991,8 +979,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV __syncwarp(); // Update last token tail (epilogue). See in-loop note on atomicMax. - if (last_rdma_tail_idx >= 0) - atomicMax(const_cast(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1); + if (last_rdma_tail_idx >= 0) atomicMax(const_cast(rdma_send_channel_tail + lane_id), last_rdma_tail_idx + 1); // Release sequential lock lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0; @@ -1035,8 +1022,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV if (owner && num_tokens_to_send > 0) { auto processed_tail = ld_acquire_cta(const_cast(rdma_send_channel_tail + dst_rdma_rank)); auto num_tokens_processed = processed_tail - last_issued_tail; - if (num_tokens_processed == num_tokens_to_send || - num_tokens_processed >= num_max_rdma_chunked_send_tokens) { + if (num_tokens_processed == num_tokens_to_send || num_tokens_processed >= num_max_rdma_chunked_send_tokens) { int n = min(num_tokens_processed, num_max_rdma_chunked_send_tokens); EP_DEVICE_ASSERT(n >= 0 && n <= num_tokens_to_send); my_num_tokens_to_issue = n; @@ -1067,10 +1053,10 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // ordering pair (release semantics on multimem.red triggers // unspecified launch failure on Azure GB200). const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const int4* src_p = reinterpret_cast( - reinterpret_cast(rdma_buffer_ptr_base) + src_offset); - int4* dst_p = reinterpret_cast( - reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); + const int4* src_p = + reinterpret_cast(reinterpret_cast(rdma_buffer_ptr_base) + src_offset); + int4* dst_p = + reinterpret_cast(reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); const int n_int4 = (int)(num_bytes_per_msg / sizeof(int4)); // Unrolled 8x to give the LSU pipeline more outstanding stores // per lane. Each lane handles k, k+32, ..., k+224 per iter @@ -1120,26 +1106,24 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV if (owner) { if (peer_rdma_bases != nullptr && dst_rdma_rank != rdma_rank) { const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const uintptr_t my_tail_off = - reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const uintptr_t my_tail_off = reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); uint64_t* peer_tail = reinterpret_cast( reinterpret_cast(peer_rdma_bases[dst_rank_global]) + my_tail_off); const uint64_t new_tail = (uint64_t)issue_tail + (uint64_t)n_issue; - asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_tail), "l"(new_tail) : "memory"); + asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_tail), "l"(new_tail) : "memory"); } else { // Self-loop: plain release atomic on local slot (no multicast). - mscclpp::atomicFetchAdd( - reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), - (uint64_t)n_issue, mscclpp::memoryOrderRelease); + mscclpp::atomicFetchAdd(reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), + (uint64_t)n_issue, mscclpp::memoryOrderRelease); } } } else if (owner) { // Legacy non-NVLS path (single-lane). if (dst_rdma_rank == rdma_rank) { // Update tails - mscclpp::atomicFetchAdd(reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), - (uint64_t)n_issue, mscclpp::memoryOrderRelease); + mscclpp::atomicFetchAdd(reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), (uint64_t)n_issue, + mscclpp::memoryOrderRelease); } else { const auto dst_slot_idx = issue_tail % num_max_rdma_chunked_recv_tokens; const size_t num_bytes_per_msg = (size_t)num_bytes_per_rdma_token * (size_t)n_issue; @@ -1147,9 +1131,9 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV dst_slot_idx * num_bytes_per_rdma_token + data_recv_offset; const auto src_offset = dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) + dst_slot_idx * num_bytes_per_rdma_token + data_send_offset; - const auto port_channel_idx = - kLowLatencyMode ? (channel_id * kNumRDMARanks + dst_rdma_rank) - : (channel_id * num_ranks + dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank); + const auto port_channel_idx = kLowLatencyMode + ? (channel_id * kNumRDMARanks + dst_rdma_rank) + : (channel_id * num_ranks + dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank); auto& handle = port_channel_handles[port_channel_idx]; handle.put(dst_offset, src_offset, num_bytes_per_msg); @@ -1159,9 +1143,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV const uint64_t new_tail = (uint64_t)issue_tail + (uint64_t)n_issue; *rdma_channel_tail_send_src.buffer(dst_rdma_rank) = new_tail; __threadfence_system(); - const auto src_off_tail = - reinterpret_cast(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const auto src_off_tail = reinterpret_cast(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); handle.put(rdma_rank * sizeof(uint64_t) + tail_send_offset, src_off_tail, sizeof(uint64_t)); } } @@ -1271,8 +1254,7 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // (sender writes peer's rdma_channel_tail slot). Self-loop tail // is a plain local atomic. Both end up in rdma_channel_tail — // single read path via ld_acquire_sys_global covers them. - cached_rdma_channel_tail = - static_cast(ld_acquire_sys_global(rdma_channel_tail.buffer(src_rdma_rank))); + cached_rdma_channel_tail = static_cast(ld_acquire_sys_global(rdma_channel_tail.buffer(src_rdma_rank))); } if (__shfl_sync(0xffffffff, cached_rdma_channel_tail > cached_rdma_channel_head, src_rdma_rank)) break; } @@ -1394,9 +1376,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // last chunks to never trigger head feedback, which deadlocked at // larger chunk_send values where 4096 tokens / 10 channels / 2 peers // ≈ 205 tokens ⇒ 3 full chunks + 1 partial. - const bool any_retired = forward_channel_retired[0] || forward_channel_retired[1] || - forward_channel_retired[2] || forward_channel_retired[3] || - forward_channel_retired[4] || forward_channel_retired[5] || + const bool any_retired = forward_channel_retired[0] || forward_channel_retired[1] || forward_channel_retired[2] || + forward_channel_retired[3] || forward_channel_retired[4] || forward_channel_retired[5] || forward_channel_retired[6] || forward_channel_retired[7]; const int head_update_threshold = any_retired ? 1 : max(1, num_max_rdma_chunked_send_tokens / 4); if (min_head != std::numeric_limits::max() and min_head >= last_head + head_update_threshold and @@ -1408,12 +1389,11 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // is `peer.rdma_channel_head.buffer(my_rdma_rank)`). Bypasses // both broken port_channel.put and the unreliable NVLS counter. const int dst_rank_global = lane_id * NUM_MAX_NVL_PEERS + nvl_rank; - const uintptr_t my_head_off = - reinterpret_cast(rdma_channel_head.buffer(rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); - uint64_t* peer_head = reinterpret_cast( - reinterpret_cast(peer_rdma_bases[dst_rank_global]) + my_head_off); - asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_head), "l"((uint64_t)min_head) : "memory"); + const uintptr_t my_head_off = reinterpret_cast(rdma_channel_head.buffer(rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); + uint64_t* peer_head = + reinterpret_cast(reinterpret_cast(peer_rdma_bases[dst_rank_global]) + my_head_off); + asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_head), "l"((uint64_t)min_head) : "memory"); } else if (lane_id == rdma_rank) { // Self-loop: plain release atomic on local slot. Cannot use NVLS // multimem here \u2014 it fans out to all NVL peers' local buffers @@ -1428,9 +1408,8 @@ __global__ void __launch_bounds__(((kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NV // Absolute-value RDMA WRITE replaces broken HW atomicAdd (see note above). *rdma_channel_head_send_src.buffer(lane_id) = (uint64_t)min_head; __threadfence_system(); - const auto src_off_head = - reinterpret_cast(rdma_channel_head_send_src.buffer(lane_id)) - - reinterpret_cast(rdma_buffer_ptr_base); + const auto src_off_head = reinterpret_cast(rdma_channel_head_send_src.buffer(lane_id)) - + reinterpret_cast(rdma_buffer_ptr_base); handle.put(dst_offset, src_off_head, sizeof(uint64_t)); } last_head = min_head; @@ -1548,9 +1527,8 @@ void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks, bool is_cached_dispatch, cudaStream_t stream, int num_channels, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev, - void* const* peer_rdma_bases) { + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev, + void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases) { constexpr int kNumDispatchRDMASenderWarps = 7; #define DISPATCH_LAUNCH_CASE(num_rdma_ranks) \ @@ -1591,8 +1569,7 @@ __global__ void cached_notify(const int rdma_clean_offset, const int rdma_num_in // Phase 4: NVLS multimem barrier path — bypasses port_channel signal/wait // (broken on Azure CX-7 RoCE) by using multimem.red.add + ld.acquire on a // pair of barrier slots (offsets +24 and +32). Both nullptr ⇒ fall back to IB. - void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, - uint64_t nvls_epoch) { + void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, uint64_t nvls_epoch) { auto sm_id = static_cast(blockIdx.x); auto thread_id = static_cast(threadIdx.x); auto num_threads = static_cast(blockDim.x); @@ -1746,8 +1723,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to int num_max_nvl_chunked_recv_tokens, int** task_fifo_ptrs, int head, int rank, cudaStream_t stream, int64_t num_rdma_bytes, int64_t num_nvl_bytes, bool is_cached_dispatch, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_mc_ptr, void* nvls_dev_ptr, size_t nvls_off_barrier, uint64_t nvls_epoch) { + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_mc_ptr, void* nvls_dev_ptr, + size_t nvls_off_barrier, uint64_t nvls_epoch) { const int num_threads = std::max(128, 32 * num_channels); const auto num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS; @@ -1775,8 +1752,8 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx, int num_to LAUNCH_KERNEL(&cfg, cached_notify_func, rdma_clean_meta.first, rdma_clean_meta.second, nvl_clean_meta.first, nvl_clean_meta.second, combined_rdma_head, num_combined_tokens, num_channels, rdma_channel_prefix_matrix, rdma_rank_prefix_sum, combined_nvl_head, rdma_buffer_ptr, buffer_ptrs, - task_fifo_ptrs, head, rank, num_ranks, is_cached_dispatch, port_channel_handles, - memory_channel_handles, nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_epoch); + task_fifo_ptrs, head, rank, num_ranks, is_cached_dispatch, port_channel_handles, memory_channel_handles, + nvls_mc_ptr, nvls_dev_ptr, nvls_off_barrier, nvls_epoch); } template @@ -2118,8 +2095,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, // Phase 4: head read — cross-node head feedback comes from peer // via fabric-VA store, self-loop head from local atomic. Both end // up in rdma_channel_head; one read path covers them. - int num_used_slots = token_start_idx - - static_cast(ld_volatile_global(rdma_channel_head.buffer(dst_rdma_rank))); + int num_used_slots = + token_start_idx - static_cast(ld_volatile_global(rdma_channel_head.buffer(dst_rdma_rank))); if (num_max_rdma_chunked_recv_tokens - num_used_slots >= num_chunked_tokens) break; // Timeout check @@ -2188,20 +2165,17 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, // is available; otherwise single-lane handle.put + flush. if (dst_rdma_rank != rdma_rank) { const auto rdma_slot_idx = token_start_idx % num_max_rdma_chunked_recv_tokens; - const size_t num_bytes_per_msg = - (size_t)num_chunked_tokens * (size_t)num_bytes_per_rdma_token; - const auto dst_offset = - rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) + - rdma_slot_idx * num_bytes_per_rdma_token + data_recv_offset; - const auto src_offset = - dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) + - rdma_slot_idx * num_bytes_per_rdma_token + data_send_offset; + const size_t num_bytes_per_msg = (size_t)num_chunked_tokens * (size_t)num_bytes_per_rdma_token; + const auto dst_offset = rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) + + rdma_slot_idx * num_bytes_per_rdma_token + data_recv_offset; + const auto src_offset = dst_rdma_rank * (num_max_rdma_chunked_recv_tokens * num_bytes_per_rdma_token) + + rdma_slot_idx * num_bytes_per_rdma_token + data_send_offset; if (peer_rdma_bases != nullptr) { const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const int4* src_p = reinterpret_cast( - reinterpret_cast(rdma_buffer_ptr_base) + src_offset); - int4* dst_p = reinterpret_cast( - reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); + const int4* src_p = + reinterpret_cast(reinterpret_cast(rdma_buffer_ptr_base) + src_offset); + int4* dst_p = + reinterpret_cast(reinterpret_cast(peer_rdma_bases[dst_rank_global]) + dst_offset); const int n_int4 = (int)(num_bytes_per_msg / sizeof(int4)); // Same 8x unroll as dispatch sender for LSU pipelining. const int stride8 = 8 * 32; @@ -2246,18 +2220,16 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, if (dst_rdma_rank != rdma_rank) { if (peer_rdma_bases != nullptr && lane_id == 0) { const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const uintptr_t my_tail_off = - reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const uintptr_t my_tail_off = reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); uint64_t* peer_tail = reinterpret_cast( reinterpret_cast(peer_rdma_bases[dst_rank_global]) + my_tail_off); const uint64_t new_tail = (uint64_t)(token_start_idx + num_chunked_tokens); - asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_tail), "l"(new_tail) : "memory"); + asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_tail), "l"(new_tail) : "memory"); } } else if (lane_id == 0) { - mscclpp::atomicFetchAdd( - reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), - (uint64_t)num_chunked_tokens, mscclpp::memoryOrderRelease); + mscclpp::atomicFetchAdd(reinterpret_cast(rdma_channel_tail.buffer(rdma_rank)), + (uint64_t)num_chunked_tokens, mscclpp::memoryOrderRelease); } } else if (lane_id == 0) { if (dst_rdma_rank == rdma_rank) { @@ -2280,9 +2252,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, const uint64_t new_tail = (uint64_t)(token_start_idx + num_chunked_tokens); *rdma_channel_tail_send_src.buffer(dst_rdma_rank) = new_tail; __threadfence_system(); - const auto src_off_tail = - reinterpret_cast(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const auto src_off_tail = reinterpret_cast(rdma_channel_tail_send_src.buffer(dst_rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); handle.put(rdma_rank * sizeof(uint64_t) + tail_send_offset, src_off_tail, sizeof(uint64_t)); } } @@ -2388,12 +2359,11 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, // self-loop → local atomicAdd if (peer_rdma_bases != nullptr && dst_rdma_rank != rdma_rank) { const int dst_rank_global = dst_rdma_rank * NUM_MAX_NVL_PEERS + nvl_rank; - const uintptr_t my_head_off = - reinterpret_cast(rdma_channel_head.buffer(rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const uintptr_t my_head_off = reinterpret_cast(rdma_channel_head.buffer(rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); uint64_t* peer_head = reinterpret_cast( reinterpret_cast(peer_rdma_bases[dst_rank_global]) + my_head_off); - asm volatile("st.release.sys.global.u64 [%0], %1;" :: "l"(peer_head), "l"((uint64_t)min_head) : "memory"); + asm volatile("st.release.sys.global.u64 [%0], %1;" ::"l"(peer_head), "l"((uint64_t)min_head) : "memory"); } else if (dst_rdma_rank == rdma_rank) { mscclpp::atomicFetchAdd(static_cast(rdma_channel_head.buffer(rdma_rank)), (uint64_t)(min_head - last_rdma_head), mscclpp::memoryOrderRelease); @@ -2406,9 +2376,8 @@ __global__ void __launch_bounds__((NUM_MAX_NVL_PEERS + 1 + kNumForwarders) * 32, // Absolute-value RDMA WRITE replaces broken HW atomicAdd. *rdma_channel_head_send_src.buffer(dst_rdma_rank) = (uint64_t)min_head; __threadfence_system(); - const auto src_off_head = - reinterpret_cast(rdma_channel_head_send_src.buffer(dst_rdma_rank)) - - reinterpret_cast(rdma_buffer_ptr_base); + const auto src_off_head = reinterpret_cast(rdma_channel_head_send_src.buffer(dst_rdma_rank)) - + reinterpret_cast(rdma_buffer_ptr_base); handle.put(dst_offset, src_off_head, sizeof(uint64_t)); } last_rdma_head = min_head; @@ -2443,9 +2412,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights void** buffer_ptrs, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens, int rank, int num_ranks, cudaStream_t stream, int num_channels, bool low_latency_mode, mscclpp::PortChannelDeviceHandle* port_channel_handles, - mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, - void* nvls_head_mc, void* nvls_head_dev, void* nvls_tail_mc, void* nvls_tail_dev, - void* const* peer_rdma_bases) { + mscclpp::MemoryChannelDeviceHandle* memory_channel_handles, void* nvls_head_mc, void* nvls_head_dev, + void* nvls_tail_mc, void* nvls_tail_dev, void* const* peer_rdma_bases) { constexpr int kNumCombineForwarderWarps = 16; #define COMBINE_LAUNCH_CASE(num_rdma_ranks) \ @@ -2458,8 +2426,8 @@ void combine(cudaDataType_t type, void* combined_x, float* combined_topk_weights rdma_rank_prefix_sum, gbl_channel_prefix_matrix, num_tokens, num_combined_tokens, hidden, num_topk, \ rdma_buffer_ptr, num_max_rdma_chunked_send_tokens, num_max_rdma_chunked_recv_tokens, buffer_ptrs, \ num_max_nvl_chunked_send_tokens, num_max_nvl_chunked_recv_tokens, rank, num_ranks, \ - port_channel_handles, memory_channel_handles, \ - nvls_head_mc, nvls_head_dev, nvls_tail_mc, nvls_tail_dev, peer_rdma_bases); \ + port_channel_handles, memory_channel_handles, nvls_head_mc, nvls_head_dev, nvls_tail_mc, \ + nvls_tail_dev, peer_rdma_bases); \ } \ break diff --git a/src/ext/ep/kernels/internode_ll.cu b/src/ext/ep/kernels/internode_ll.cu index e4c3781c..827e2ee8 100644 --- a/src/ext/ep/kernels/internode_ll.cu +++ b/src/ext/ep/kernels/internode_ll.cu @@ -431,16 +431,13 @@ void dispatch(void* packed_recv_x, float* packed_recv_x_scales, int* packed_recv cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, cur_dev); int max_active_narrow = 0; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_narrow, - (void*)dispatch, + &max_active_narrow, (void*)dispatch, kNumWarpGroupsIpc * kNumWarpsPerGroupIpc * 32, 0); const int cap_narrow = max_active_narrow * sm_count; ipc_wide = (num_experts > cap_narrow); } - const int kNumWarpGroups = - (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma; - const int kNumWarpsPerGroup = - (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma; + const int kNumWarpGroups = (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma; + const int kNumWarpsPerGroup = (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma; const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup; const auto num_sms_base = cell_div(num_experts, kNumWarpGroups); // LL dispatch/combine are latency-bound at typical problem sizes: for @@ -688,16 +685,13 @@ void combine(void* combined_x, void* rdma_recv_x, int64_t* rdma_recv_flag, void* cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, cur_dev); int max_active_narrow = 0; cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_active_narrow, - (void*)combine, + &max_active_narrow, (void*)combine, kNumWarpGroupsIpc * kNumWarpsPerGroupIpc * 32, 0); const int cap_narrow = max_active_narrow * sm_count; ipc_wide = (num_experts > cap_narrow); } - const int kNumWarpGroups = - (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma; - const int kNumWarpsPerGroup = - (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma; + const int kNumWarpGroups = (use_ipc_path && !ipc_wide) ? kNumWarpGroupsIpc : kNumWarpGroupsRdma; + const int kNumWarpsPerGroup = (use_ipc_path && !ipc_wide) ? kNumWarpsPerGroupIpc : kNumWarpsPerGroupRdma; const auto num_warps = kNumWarpGroups * kNumWarpsPerGroup; const auto num_sms_base = cell_div(num_experts, kNumWarpGroups); // See the comment in `dispatch()` above: combine-recv's per-token loop diff --git a/src/ext/ep/kernels/utils.cuh b/src/ext/ep/kernels/utils.cuh index fadd2488..ec84a507 100644 --- a/src/ext/ep/kernels/utils.cuh +++ b/src/ext/ep/kernels/utils.cuh @@ -2,10 +2,10 @@ // Licensed under the MIT License. #pragma once +#include #include #include "exception.cuh" -#include #define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \ { \ diff --git a/test/python/ext/ep/test_internode_multirank.py b/test/python/ext/ep/test_internode_multirank.py index b7ae2e8d..8fec792b 100644 --- a/test/python/ext/ep/test_internode_multirank.py +++ b/test/python/ext/ep/test_internode_multirank.py @@ -96,6 +96,7 @@ def main(): # Small settings for functional check import os as _os + num_tokens = int(_os.environ.get("MSCCLPP_EP_HT_TOKENS", "128")) hidden = int(_os.environ.get("MSCCLPP_EP_HT_HIDDEN", "1024")) num_topk = int(_os.environ.get("MSCCLPP_EP_HT_TOPK", str(min(4, num_ranks)))) @@ -140,7 +141,13 @@ def main(): # Buffer config for internode HT: needs num_rdma_bytes > 0. Size buffers # using max(hidden, bench_hidden) so the optional bench phase fits. - cfg = ep.Config(int(os.environ.get("MSCCLPP_EP_NSM","152")), int(os.environ.get("MSCCLPP_EP_NVL_SEND","8")), int(os.environ.get("MSCCLPP_EP_NVL_RECV","256")), int(os.environ.get("MSCCLPP_EP_RDMA_SEND","16")), int(os.environ.get("MSCCLPP_EP_RDMA_RECV","128"))) + cfg = ep.Config( + int(os.environ.get("MSCCLPP_EP_NSM", "152")), + int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")), + int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256")), + int(os.environ.get("MSCCLPP_EP_RDMA_SEND", "16")), + int(os.environ.get("MSCCLPP_EP_RDMA_RECV", "128")), + ) _bench_on = os.environ.get("MSCCLPP_EP_BENCH", "0") == "1" _buf_hidden = max(hidden, int(os.environ.get("MSCCLPP_EP_BENCH_HIDDEN", "0"))) if _bench_on else hidden num_nvl_bytes = cfg.get_nvl_buffer_size_hint(_buf_hidden * x.element_size(), num_ranks) diff --git a/test/python/ext/ep/test_intranode_multirank.py b/test/python/ext/ep/test_intranode_multirank.py index c6ac77f8..5bb9a929 100644 --- a/test/python/ext/ep/test_intranode_multirank.py +++ b/test/python/ext/ep/test_intranode_multirank.py @@ -106,9 +106,11 @@ def main(): # Allocate Buffer (intranode only: num_rdma_bytes=0). Size the NVL buffer # using max(hidden, bench_hidden) so the optional bench phase fits. - cfg = ep.Config(int(os.environ.get("MSCCLPP_EP_NUM_SMS", "20")), - int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")), - int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256"))) + cfg = ep.Config( + int(os.environ.get("MSCCLPP_EP_NUM_SMS", "20")), + int(os.environ.get("MSCCLPP_EP_NVL_SEND", "8")), + int(os.environ.get("MSCCLPP_EP_NVL_RECV", "256")), + ) _bench_on = os.environ.get("MSCCLPP_EP_BENCH", "0") == "1" _buf_hidden = max(hidden, int(os.environ.get("MSCCLPP_EP_BENCH_HIDDEN", "0"))) if _bench_on else hidden num_nvl_bytes = cfg.get_nvl_buffer_size_hint(_buf_hidden * x.element_size(), num_ranks) @@ -304,8 +306,8 @@ def main(): # This matches NCCL-EP's `ep_bench` convention and isolates the on-GPU # dispatch kernel cost from one-time setup overhead. _layout = _dispatch() - _cached_rpm = _layout[5] # rank_prefix_matrix - _cached_cpm = _layout[6] # channel_prefix_matrix + _cached_rpm = _layout[5] # rank_prefix_matrix + _cached_cpm = _layout[6] # channel_prefix_matrix _cached_n = int(_layout[0].size(0)) # num_recv_tokens on this rank def _dispatch_cached():