mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-21 05:19:24 +00:00
Merge remote-tracking branch 'origin/main' into qinghuazhou/expert_parallel_merge_main_test
# Conflicts: # src/core/connection.cc # test/mp_unit/port_channel_tests.cu
This commit is contained in:
@@ -55,6 +55,7 @@ option(MSCCLPP_BUILD_EXT_EP "Build Expert-Parallel (MoE dispatch/combine) extens
|
||||
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
|
||||
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
|
||||
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
|
||||
option(MSCCLPP_USE_MRC "Enable MRC support" OFF)
|
||||
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
|
||||
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
|
||||
option(MSCCLPP_ENABLE_COVERAGE "Enable code coverage" OFF)
|
||||
|
||||
@@ -126,6 +126,45 @@ $ python -m pip install ".[cuda12,benchmark]"
|
||||
$ python -m pip install ".[cuda12,benchmark,test]"
|
||||
```
|
||||
|
||||
(mrc-support)=
|
||||
## MRC Support
|
||||
|
||||
MSCCL++ supports execution over **Multi-path Reliable Connection (MRC)**, which enables the use of multiple network paths to improve bandwidth utilization and resilience.
|
||||
|
||||
To enable MRC support, you must configure both the **build-time** and **runtime** environments as described below.
|
||||
|
||||
---
|
||||
|
||||
### 1. Install MRC Verbs Shim
|
||||
|
||||
MSCCL++ relies on a custom verbs shim library that intercepts standard `libibverbs` calls and redirects them to an MRC-enabled implementation.
|
||||
|
||||
- Install the [MRC verbs shim library](https://github.com/microsoft/mrc-verbs-shim-lib) on all nodes in the cluster.
|
||||
- Ensure that the underlying system has MRC support enabled.
|
||||
|
||||
---
|
||||
|
||||
### 2. Build MSCCL++ with MRC Enabled
|
||||
|
||||
Enable MRC support during the build by adding the following CMake option:
|
||||
|
||||
```bash
|
||||
-DMSCCLPP_USE_MRC=ON
|
||||
```
|
||||
|
||||
This configures MSCCL++ to use the MRC-enabled verbs layer at runtime.
|
||||
|
||||
### 3. Configure Runtime Environment
|
||||
|
||||
At runtime, you must configure environment variables to override the default RDMA libraries and link against the MRC-enabled stack:
|
||||
|
||||
```bash
|
||||
-x MSCCLPP_IBV_SO=:$MRC-SHIM-HOME/libibverbs.so
|
||||
-x LD_LIBRARY_PATH=$MRC-SHIM-HOME/mrc-header-lib:$LD_LIBRARY_PATH
|
||||
-x VMRC_LIBMRC_SO=/opt/mellanox/doca/lib/aarch64-linux-gnu/libnv_mrc.so"
|
||||
-x VMRC_LIBIBVERBS_SO=/lib/aarch64-linux-gnu/libibverbs.so.1
|
||||
```
|
||||
|
||||
(vscode-dev-container)=
|
||||
## VSCode Dev Container
|
||||
|
||||
|
||||
@@ -683,6 +683,8 @@ class Connection {
|
||||
friend class Semaphore;
|
||||
friend class ProxyService;
|
||||
friend class BaseConnection;
|
||||
friend struct BasePortChannel;
|
||||
friend struct PortChannel;
|
||||
};
|
||||
|
||||
/// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user.
|
||||
|
||||
@@ -29,6 +29,11 @@ class Fifo {
|
||||
/// Remove the head trigger.
|
||||
void pop();
|
||||
|
||||
/// Get the current tail position — the FIFO push-return value of the trigger about to be
|
||||
/// (or currently being) processed by the proxy thread. Monotonically increasing.
|
||||
/// @return The current tail position.
|
||||
uint64_t tail() const;
|
||||
|
||||
/// Get FIFO size.
|
||||
/// @return Number of entries in the FIFO.
|
||||
int size() const;
|
||||
|
||||
@@ -21,7 +21,10 @@ using __bfloat162 = __hip_bfloat162;
|
||||
#if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6)
|
||||
#include <hip/hip_fp8.h>
|
||||
|
||||
// Create aliases matching CUDA naming convention for cross-platform compatibility
|
||||
// Create aliases matching CUDA naming convention for cross-platform compatibility.
|
||||
// Define __FP8_E4M3_IS_FNUZ__ / __FP8_E5M2_IS_FNUZ__ when the platform-native FP8 is the
|
||||
// "fnuz" variant (no infinities, NaN-only at 0x80, bias differs from OCP). Dispatch layers
|
||||
// use these macros to throw on unsupported variants requested via DataType.
|
||||
#if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP)
|
||||
using __fp8_e4m3 = __hip_fp8_e4m3_fnuz;
|
||||
using __fp8_e5m2 = __hip_fp8_e5m2_fnuz;
|
||||
@@ -29,6 +32,8 @@ using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz;
|
||||
using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz;
|
||||
using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz;
|
||||
using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz;
|
||||
#define __FP8_E4M3_IS_FNUZ__
|
||||
#define __FP8_E5M2_IS_FNUZ__
|
||||
#else
|
||||
using __fp8_e4m3 = __hip_fp8_e4m3;
|
||||
using __fp8_e5m2 = __hip_fp8_e5m2;
|
||||
@@ -66,8 +71,8 @@ using __bfloat162 = __nv_bfloat162;
|
||||
|
||||
/// Software float8 with 4 exponent bits, 3 mantissa bits, exponent bias = 15.
|
||||
/// Format (MSB first): [sign:1][exponent:4][mantissa:3]
|
||||
/// No infinities; exp=15 is NaN. Negative zero is NaN (fnuz convention).
|
||||
/// Max finite value: 0.9375, min normal: ~6.1e-5, min subnormal: ~7.6e-6.
|
||||
/// No infinities, no NaN. Encode saturates to ±1.75 (0x7e/0xfe).
|
||||
/// Adapted from the Triton compiler's fp8e4b15 format.
|
||||
struct alignas(1) __fp8_e4m3b15 {
|
||||
uint8_t __x;
|
||||
|
||||
@@ -97,35 +102,15 @@ struct alignas(1) __fp8_e4m3b15 {
|
||||
/// Algorithm: reinterpret fp8 bits into an fp16 bit pattern with exponent shifted by -8,
|
||||
/// then convert fp16 → float32.
|
||||
static MSCCLPP_HOST_DEVICE_INLINE float toFloat(uint8_t bits) {
|
||||
// Handle special values: negative zero (0x80) → NaN, exponent=15 → NaN.
|
||||
uint32_t exp = (bits >> 3) & 0xFu;
|
||||
if (bits == 0x80 || exp == 15) {
|
||||
union {
|
||||
uint32_t u;
|
||||
float f;
|
||||
} nan_val = {0x7FC00000u};
|
||||
return nan_val.f;
|
||||
}
|
||||
if (bits == 0) return 0.0f;
|
||||
|
||||
// Triton-style bit manipulation: fp8 → fp16 → fp32.
|
||||
// fp8 layout: [S:1][E:4][M:3] (bias=15)
|
||||
// fp16 layout: [S:1][E:5][M:10] (bias=15)
|
||||
//
|
||||
// Place fp8 in upper byte of fp16, then right-shift exponent+mantissa by 1
|
||||
// to convert E4 → E5 (both share bias=15). Sign bit stays at bit 15.
|
||||
// Branch-free decode: fp8 → fp16 → fp32, no special-case handling.
|
||||
// Encode saturates to ±1.75, so 0x7f/0xff are never produced.
|
||||
// Refer:
|
||||
// https://github.com/triton-lang/triton/blob/cf34004b8a67d290a962da166f5aa2fc66751326/python/triton/language/extra/cuda/utils.py#L34
|
||||
uint16_t h = (uint16_t)bits << 8; // place fp8 in upper byte of fp16
|
||||
uint16_t sign16 = h & 0x8000u; // extract sign at fp16 position
|
||||
uint16_t nosign = h & 0x7F00u; // exponent + mantissa (no sign)
|
||||
uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1
|
||||
uint16_t fp16_bits = sign16 | (nosign >> 1); // shift exponent right by 1 (E4→E5)
|
||||
|
||||
// For subnormals: when fp8 exponent=0, the above gives fp16 exponent=0
|
||||
// and fp16 mantissa = (fp8_mantissa << 7), which correctly represents
|
||||
// the subnormal fp16 value since both share bias=15.
|
||||
|
||||
// Convert fp16 bits to float via __half (works on host and device, CUDA and HIP).
|
||||
union {
|
||||
uint16_t u;
|
||||
__half h;
|
||||
@@ -139,14 +124,6 @@ struct alignas(1) __fp8_e4m3b15 {
|
||||
/// The key insight is to convert to fp16 first (which shares bias=15 with e4m3b15),
|
||||
/// then pack the fp16 bits back into 8 bits by shifting the exponent left by 1.
|
||||
static MSCCLPP_HOST_DEVICE_INLINE uint8_t fromFloat(float val) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t u;
|
||||
} in = {val};
|
||||
|
||||
// NaN → 0x80 (negative-zero bit pattern = NaN in fnuz).
|
||||
if ((in.u & 0x7F800000u) == 0x7F800000u && (in.u & 0x007FFFFFu) != 0) return 0x80u;
|
||||
|
||||
// Convert float32 → fp16 bits via __half (works on host and device, CUDA and HIP).
|
||||
__half h_val = __float2half_rn(val);
|
||||
union {
|
||||
@@ -155,32 +132,19 @@ struct alignas(1) __fp8_e4m3b15 {
|
||||
} cvt = {h_val};
|
||||
uint16_t fp16_bits = cvt.u;
|
||||
|
||||
// Clamp absolute value to max finite e4m3b15: 0.9375 → fp16 = 0x3B80.
|
||||
// Clamp abs to max encodable value: 1.75 → fp16 = 0x3F00.
|
||||
// Matches Triton: encode saturates, 0x7f/0xff are never produced.
|
||||
uint16_t abs_fp16 = fp16_bits & 0x7FFFu;
|
||||
if (abs_fp16 > 0x3B80u) abs_fp16 = 0x3B80u;
|
||||
if (abs_fp16 > 0x3F00u) abs_fp16 = 0x3F00u;
|
||||
|
||||
// Reconstruct with sign.
|
||||
uint16_t sign16 = fp16_bits & 0x8000u;
|
||||
|
||||
// Triton-style: fp16 → fp8.
|
||||
// fp16 layout: [S:1][E:5][M:10] (bias=15)
|
||||
// fp8 layout: [S:1][E:4][M:3] (bias=15)
|
||||
//
|
||||
// mad.lo.u32 a0, a0, 2, 0x00800080 → (abs_fp16 * 2 + 0x0080)
|
||||
// This shifts left by 1 (undoing the right-shift in decode) and adds rounding bias.
|
||||
// Then: lop3.b32 b0, $1, 0x80008000, a0, 0xea → (sign & 0x8000) | a0
|
||||
// Finally: prmt for byte extraction.
|
||||
//
|
||||
// Simplified for scalar: shift abs_fp16 left by 1, add rounding bias, take upper byte.
|
||||
// fp16 → fp8: shift abs left by 1 (undo decode's right-shift), add rounding bias, take upper byte.
|
||||
uint16_t adjusted = (uint16_t)(abs_fp16 * 2u + 0x0080u);
|
||||
// The upper byte now contains [E:4][M:3][round_bit].
|
||||
// Combine with sign and extract.
|
||||
uint16_t with_sign = sign16 | adjusted;
|
||||
uint8_t result = (uint8_t)(with_sign >> 8);
|
||||
|
||||
// Zero → 0x00 (ensure positive zero, not negative zero which is NaN).
|
||||
if ((result & 0x7Fu) == 0) result = 0x00u;
|
||||
|
||||
return result;
|
||||
}
|
||||
};
|
||||
@@ -199,16 +163,18 @@ namespace mscclpp {
|
||||
|
||||
/// Data types supported by mscclpp operations.
|
||||
enum class DataType {
|
||||
INT32, // 32-bit signed integer.
|
||||
UINT32, // 32-bit unsigned integer.
|
||||
FLOAT16, // IEEE 754 half precision.
|
||||
FLOAT32, // IEEE 754 single precision.
|
||||
BFLOAT16, // bfloat16 precision.
|
||||
FLOAT8_E4M3, // float8 with E4M3 layout.
|
||||
FLOAT8_E5M2, // float8 with E5M2 layout.
|
||||
UINT8, // 8-bit unsigned integer.
|
||||
FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel).
|
||||
AUTO = 255, // Sentinel: resolve to the input dtype at runtime.
|
||||
INT32, // 32-bit signed integer.
|
||||
UINT32, // 32-bit unsigned integer.
|
||||
FLOAT16, // IEEE 754 half precision.
|
||||
FLOAT32, // IEEE 754 single precision.
|
||||
BFLOAT16, // bfloat16 precision.
|
||||
FLOAT8_E4M3FN, // float8 E4M3, OCP variant (NV; AMD HIP > 6 with OCP enabled).
|
||||
FLOAT8_E4M3FNUZ, // float8 E4M3, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled).
|
||||
FLOAT8_E5M2, // float8 E5M2, OCP variant (NV; AMD HIP > 6 with OCP enabled).
|
||||
FLOAT8_E5M2FNUZ, // float8 E5M2, fnuz variant (AMD HIP 6, or HIP > 6 with FNUZ enabled).
|
||||
UINT8, // 8-bit unsigned integer.
|
||||
FLOAT8_E4M3B15, // float8 with E4M3 layout, bias=15 (software, no HW accel).
|
||||
AUTO = 255, // Sentinel: resolve to the input dtype at runtime.
|
||||
};
|
||||
|
||||
/// Word array.
|
||||
@@ -1137,11 +1103,11 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint32_t in0;
|
||||
asm("mov.b32 %0, %1;" : "=r"(in0) : "r"(*reinterpret_cast<const uint32_t*>(&v)));
|
||||
// Clamp abs to max finite e4m3b15 (0x3B80 = 0.9375 in fp16).
|
||||
// Clamp abs to max encodable e4m3b15 (0x3F00 = 1.75 in fp16).
|
||||
uint32_t lo = in0 & 0xFFFFu, hi = in0 >> 16;
|
||||
uint32_t alo = lo & 0x7FFFu, ahi = hi & 0x7FFFu;
|
||||
alo = alo < 0x3B80u ? alo : 0x3B80u;
|
||||
ahi = ahi < 0x3B80u ? ahi : 0x3B80u;
|
||||
alo = alo < 0x3F00u ? alo : 0x3F00u;
|
||||
ahi = ahi < 0x3F00u ? ahi : 0x3F00u;
|
||||
uint32_t a0 = alo | (ahi << 16);
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
@@ -1152,7 +1118,7 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x2 to<f8_e4m3b15x2, f16x2>(const f16x2& v) {
|
||||
uint32_t in0 = v.words[0];
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu;
|
||||
uint32_t a0;
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u));
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
uint16_t packed = (uint16_t)(((b0 >> 8) & 0xFFu) | ((b0 >> 16) & 0xFF00u));
|
||||
@@ -1175,8 +1141,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
|
||||
asm("mov.b32 %0, %1;" : "=r"(in1) : "r"(v.words[1]));
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu;
|
||||
uint32_t abs1 = in1 & 0x7fff7fffu;
|
||||
uint32_t a0 = __vminu2(abs0, 0x3B803B80u);
|
||||
uint32_t a1 = __vminu2(abs1, 0x3B803B80u);
|
||||
uint32_t a0 = __vminu2(abs0, 0x3F003F00u);
|
||||
uint32_t a1 = __vminu2(abs1, 0x3F003F00u);
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
a1 = a1 * 2u + 0x00800080u;
|
||||
uint32_t b0, b1;
|
||||
@@ -1189,8 +1155,8 @@ MSCCLPP_DEVICE_INLINE f8_e4m3b15x4 to<f8_e4m3b15x4, f16x4>(const f16x4& v) {
|
||||
uint32_t in0 = v.words[0], in1 = v.words[1];
|
||||
uint32_t abs0 = in0 & 0x7fff7fffu, abs1 = in1 & 0x7fff7fffu;
|
||||
uint32_t a0, a1;
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3B803B80u));
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3B803B80u));
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a0) : "v"(abs0), "v"(0x3F003F00u));
|
||||
asm volatile("v_pk_min_u16 %0, %1, %2" : "=v"(a1) : "v"(abs1), "v"(0x3F003F00u));
|
||||
a0 = a0 * 2u + 0x00800080u;
|
||||
a1 = a1 * 2u + 0x00800080u;
|
||||
uint32_t b0 = a0 | (in0 & 0x80008000u);
|
||||
|
||||
@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
|
||||
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
|
||||
cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
void gpuMemset(void* ptr, int value, size_t bytes);
|
||||
|
||||
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
|
||||
/// destroyed.
|
||||
@@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
|
||||
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
|
||||
}
|
||||
|
||||
/// Sets `bytes` of memory at `ptr` to `value` synchronously.
|
||||
/// @param ptr Destination address.
|
||||
/// @param value Value to set (interpreted as unsigned char per CUDA semantics).
|
||||
/// @param bytes Number of bytes to set.
|
||||
inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
|
||||
|
||||
/// Check if NVLink SHARP (NVLS) is supported.
|
||||
///
|
||||
/// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
|
||||
|
||||
@@ -84,8 +84,12 @@ class ProxyService : public BaseProxyService {
|
||||
std::vector<RegisteredMemory> memories_;
|
||||
std::shared_ptr<Proxy> proxy_;
|
||||
std::unordered_map<std::shared_ptr<BaseConnection>, int> inflightRequests_;
|
||||
// Latest pending TriggerSync FIFO position per connection. Proxy publishes pos+1 to the
|
||||
// connection's gpuFlushDonePos_ when the CQ drains, then erases the entry.
|
||||
std::unordered_map<std::shared_ptr<BaseConnection>, uint64_t> pendingFlushPos_;
|
||||
|
||||
ProxyHandlerResult handleTrigger(ProxyTrigger triggerRaw);
|
||||
void progressFlushes();
|
||||
};
|
||||
|
||||
/// Port channel without specifying source/destination memory regions.
|
||||
|
||||
@@ -17,6 +17,19 @@ using SemaphoreId = uint32_t;
|
||||
/// actual.
|
||||
using MemoryId = uint32_t;
|
||||
|
||||
namespace detail {
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
/// Wait until the proxy has processed and drained the TriggerSync at FIFO position `fifoPos`.
|
||||
/// The proxy publishes `flushDonePos = latestCompletedPos + 1` when the CQ drains, so the
|
||||
/// wait condition `flushDonePos > fifoPos` is satisfied exactly when our own request has
|
||||
/// been completed. Using the FIFO push position as the wait target couples the wait to the
|
||||
/// FIFO order, avoiding races when multiple GPU threads concurrently flush the same channel.
|
||||
MSCCLPP_DEVICE_INLINE void waitFlush(uint64_t* flushDonePos, uint64_t fifoPos, [[maybe_unused]] int64_t maxSpinCount) {
|
||||
POLL_MAYBE_JAILBREAK((atomicLoad<uint64_t, scopeSystem>(flushDonePos, memoryOrderAcquire) <= fifoPos), maxSpinCount);
|
||||
}
|
||||
#endif // defined(MSCCLPP_DEVICE_COMPILE)
|
||||
} // namespace detail
|
||||
|
||||
struct BasePortChannelDeviceHandle {
|
||||
SemaphoreId semaphoreId_;
|
||||
|
||||
@@ -26,12 +39,16 @@ struct BasePortChannelDeviceHandle {
|
||||
// can produce for and the sole proxy thread consumes it.
|
||||
FifoDeviceHandle fifo_;
|
||||
|
||||
// One past the highest FIFO position with a completed flush on this connection.
|
||||
// Host-pinned: proxy writes after CQ drain, GPU reads in waitFlush().
|
||||
uint64_t* flushDonePos_;
|
||||
|
||||
MSCCLPP_INLINE BasePortChannelDeviceHandle() = default;
|
||||
|
||||
MSCCLPP_HOST_DEVICE_INLINE BasePortChannelDeviceHandle(SemaphoreId semaphoreId,
|
||||
Host2DeviceSemaphoreDeviceHandle semaphore,
|
||||
FifoDeviceHandle fifo)
|
||||
: semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo) {}
|
||||
FifoDeviceHandle fifo, uint64_t* flushDonePos)
|
||||
: semaphoreId_(semaphoreId), semaphore_(semaphore), fifo_(fifo), flushDonePos_(flushDonePos) {}
|
||||
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
/// Push a TriggerData to the FIFO.
|
||||
@@ -86,9 +103,9 @@ struct BasePortChannelDeviceHandle {
|
||||
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
|
||||
MSCCLPP_DEVICE_INLINE void putWithSignalAndFlush(MemoryId dstId, uint64_t dstOffset, MemoryId srcId,
|
||||
uint64_t srcOffset, uint64_t size, int64_t maxSpinCount = 1000000) {
|
||||
uint64_t curFifoHead =
|
||||
uint64_t pos =
|
||||
fifo_.push({TriggerData | TriggerFlag | TriggerSync, dstId, dstOffset, srcId, srcOffset, size, semaphoreId_});
|
||||
fifo_.sync(curFifoHead, maxSpinCount);
|
||||
detail::waitFlush(flushDonePos_, pos, maxSpinCount);
|
||||
}
|
||||
|
||||
/// Push a TriggerData, a TriggerFlag, and a TriggerSync at the same time to the FIFO.
|
||||
@@ -105,8 +122,8 @@ struct BasePortChannelDeviceHandle {
|
||||
/// Push a TriggerSync to the FIFO.
|
||||
/// @param maxSpinCount The maximum number of spin counts before asserting. Never assert if negative.
|
||||
MSCCLPP_DEVICE_INLINE void flush(int64_t maxSpinCount = 1000000) {
|
||||
uint64_t curFifoHead = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_});
|
||||
fifo_.sync(curFifoHead, maxSpinCount);
|
||||
uint64_t pos = fifo_.push({TriggerSync, 0, 0, 0, 0, 1, semaphoreId_});
|
||||
detail::waitFlush(flushDonePos_, pos, maxSpinCount);
|
||||
}
|
||||
|
||||
/// Push an atomic add trigger to the FIFO to perform a remote atomic add on a 64-bit value.
|
||||
@@ -146,8 +163,8 @@ struct PortChannelDeviceHandle : public BasePortChannelDeviceHandle {
|
||||
|
||||
MSCCLPP_HOST_DEVICE_INLINE PortChannelDeviceHandle(SemaphoreId semaphoreId,
|
||||
Host2DeviceSemaphoreDeviceHandle semaphore, FifoDeviceHandle fifo,
|
||||
MemoryId dst, MemoryId src)
|
||||
: BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo), dst_(dst), src_(src) {}
|
||||
MemoryId dst, MemoryId src, uint64_t* flushDonePos)
|
||||
: BasePortChannelDeviceHandle(semaphoreId, semaphore, fifo, flushDonePos), dst_(dst), src_(src) {}
|
||||
|
||||
#if defined(MSCCLPP_DEVICE_COMPILE)
|
||||
/// Push a TriggerData to the FIFO.
|
||||
|
||||
@@ -20,8 +20,9 @@ enum class ProxyHandlerResult {
|
||||
};
|
||||
|
||||
class Proxy;
|
||||
class ProxyService;
|
||||
|
||||
/// Handler function type for proxy.
|
||||
/// Handler function type for proxy. Called once per ready FIFO trigger.
|
||||
using ProxyHandler = std::function<ProxyHandlerResult(ProxyTrigger)>;
|
||||
|
||||
/// Host-side proxy for PortChannels.
|
||||
@@ -54,6 +55,7 @@ class Proxy {
|
||||
std::shared_ptr<Fifo> fifo();
|
||||
|
||||
private:
|
||||
friend class ProxyService;
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> pimpl_;
|
||||
};
|
||||
|
||||
@@ -46,6 +46,15 @@ std::string getIBDeviceName(Transport ibTransport);
|
||||
/// @return The InfiniBand transport associated with the specified device name.
|
||||
Transport getIBTransportByDeviceName(const std::string& ibDeviceName);
|
||||
|
||||
/// Check whether this process can allocate/import CUDA memory with NVIDIA fabric handles
|
||||
/// (`CU_MEM_HANDLE_TYPE_FABRIC`). Fabric handles enable cross-node `Transport::CudaIpc` on
|
||||
/// MNNVL systems (e.g., GB200 NVL72) when the IMEX service is running. Returns `false` on
|
||||
/// hardware/software stacks without MNNVL+IMEX, in which case `Transport::CudaIpc` is
|
||||
/// restricted to ranks within the same node.
|
||||
///
|
||||
/// @return `true` if fabric handles are usable from this process, `false` otherwise.
|
||||
bool isFabricMemHandleAvailable();
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_UTILS_HPP_
|
||||
|
||||
@@ -45,8 +45,10 @@ void register_core(nb::module_& m) {
|
||||
.value("float16", DataType::FLOAT16)
|
||||
.value("float32", DataType::FLOAT32)
|
||||
.value("bfloat16", DataType::BFLOAT16)
|
||||
.value("float8_e4m3", DataType::FLOAT8_E4M3)
|
||||
.value("float8_e4m3fn", DataType::FLOAT8_E4M3FN)
|
||||
.value("float8_e4m3fnuz", DataType::FLOAT8_E4M3FNUZ)
|
||||
.value("float8_e5m2", DataType::FLOAT8_E5M2)
|
||||
.value("float8_e5m2fnuz", DataType::FLOAT8_E5M2FNUZ)
|
||||
.value("uint8", DataType::UINT8)
|
||||
.value("float8_e4m3b15", DataType::FLOAT8_E4M3B15);
|
||||
|
||||
@@ -328,4 +330,4 @@ NB_MODULE(_mscclpp, m) {
|
||||
|
||||
// ext
|
||||
register_algorithm_collection_builder(m);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ from mscclpp.language.utils import AlgoSpec
|
||||
default_algo_configs = [
|
||||
{
|
||||
"filename": "allreduce_2nodes_1K_64K.json",
|
||||
"function": def_algo.allreduce_2nodes,
|
||||
"function": def_algo.allreduce_multi_nodes,
|
||||
"spec": AlgoSpec(
|
||||
name="allreduce_2nodes_1K_64K",
|
||||
collective=AllReduce(16, 1, True),
|
||||
@@ -34,7 +34,7 @@ default_algo_configs = [
|
||||
},
|
||||
{
|
||||
"filename": "allreduce_2nodes_128K_2M.json",
|
||||
"function": def_algo.allreduce_2nodes,
|
||||
"function": def_algo.allreduce_multi_nodes,
|
||||
"spec": AlgoSpec(
|
||||
name="allreduce_2nodes_128K_2M",
|
||||
collective=AllReduce(16, 1, True),
|
||||
@@ -53,6 +53,48 @@ default_algo_configs = [
|
||||
),
|
||||
"additional_kwargs": {"thread_block_group_size": 4},
|
||||
},
|
||||
{
|
||||
"filename": "allreduce_4nodes_1K_64K.json",
|
||||
"function": def_algo.allreduce_multi_nodes,
|
||||
"spec": AlgoSpec(
|
||||
name="allreduce_4nodes_1K_64K",
|
||||
collective=AllReduce(32, 1, True),
|
||||
nranks_per_node=8,
|
||||
world_size=32,
|
||||
in_place=True,
|
||||
instances=1,
|
||||
protocol="LL",
|
||||
auto_sync=False,
|
||||
num_threads_per_block=1024,
|
||||
reuse_resources=True,
|
||||
use_double_scratch_buffer=True,
|
||||
min_message_size=1 << 10,
|
||||
max_message_size=64 << 10,
|
||||
tags={"default": 1},
|
||||
),
|
||||
"additional_kwargs": {"thread_block_group_size": 1},
|
||||
},
|
||||
{
|
||||
"filename": "allreduce_4nodes_128K_2M.json",
|
||||
"function": def_algo.allreduce_multi_nodes,
|
||||
"spec": AlgoSpec(
|
||||
name="allreduce_4nodes_128K_2M",
|
||||
collective=AllReduce(32, 1, True),
|
||||
nranks_per_node=8,
|
||||
world_size=32,
|
||||
in_place=True,
|
||||
instances=1,
|
||||
protocol="LL",
|
||||
auto_sync=False,
|
||||
num_threads_per_block=1024,
|
||||
reuse_resources=True,
|
||||
use_double_scratch_buffer=True,
|
||||
min_message_size=128 << 10,
|
||||
max_message_size=2 << 20,
|
||||
tags={"default": 1},
|
||||
),
|
||||
"additional_kwargs": {"thread_block_group_size": 4},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
from mscclpp.default_algos.allreduce_2nodes import allreduce_2nodes
|
||||
from mscclpp.default_algos.allreduce_multi_nodes import allreduce_multi_nodes
|
||||
|
||||
__all__ = ["allreduce_2nodes"]
|
||||
__all__ = ["allreduce_multi_nodes"]
|
||||
|
||||
@@ -2,9 +2,11 @@
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""
|
||||
Multi-node AllReduce implementation using packet-based communication.
|
||||
This implements a hierarchical AllReduce: intra-node allreduce followed by
|
||||
inter-node exchange and final intra-node allreduce.
|
||||
Generalized multi-node AllReduce implementation using packet-based communication.
|
||||
This implements a hierarchical AllReduce for N nodes:
|
||||
1. Intra-node reduce-scatter (each GPU reduces its assigned chunk across the node)
|
||||
2. Inter-node allreduce (exchange fully intra-reduced chunks across all nodes)
|
||||
3. Intra-node broadcast (distribute the fully reduced chunks back to all GPUs in the node)
|
||||
"""
|
||||
|
||||
from mscclpp.language.utils import AlgoSpec
|
||||
@@ -15,7 +17,7 @@ from mscclpp.language.program import *
|
||||
from mscclpp.language.collectives import *
|
||||
|
||||
|
||||
def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram:
|
||||
def allreduce_multi_nodes(spec: AlgoSpec, thread_block_group_size: int) -> CollectiveProgram:
|
||||
"""
|
||||
Implements a multi-node AllReduce using a hierarchical approach:
|
||||
1. Intra-node allreduce
|
||||
@@ -23,10 +25,10 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
3. Intra-node allreduce
|
||||
"""
|
||||
# Configuration constants
|
||||
num_nodes = 2
|
||||
num_nodes = spec.world_size // spec.nranks_per_node
|
||||
gpus_per_node = spec.nranks_per_node
|
||||
total_gpus = num_nodes * gpus_per_node
|
||||
packets_per_gpu = 2
|
||||
packets_per_gpu = num_nodes
|
||||
|
||||
with CollectiveProgram.from_spec(spec) as prog:
|
||||
# Initialize communication channels and buffers
|
||||
@@ -54,11 +56,21 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
)
|
||||
)
|
||||
|
||||
scratch_buffer_size = packets_per_gpu * (total_gpus + 1)
|
||||
# Scratch buffer layout (3 contiguous regions):
|
||||
# Region 1 [0, total_gpus):
|
||||
# Intra-node reduce-scatter. Each GPU receives chunks from gpus_per_node peers,
|
||||
# packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots.
|
||||
# Region 2 [total_gpus, total_gpus + num_nodes * packets_per_gpu):
|
||||
# Inter-node exchange. Each GPU receives reduced chunks from num_nodes nodes,
|
||||
# packets_per_gpu each → num_nodes * packets_per_gpu slots.
|
||||
# Region 3 [total_gpus + num_nodes * packets_per_gpu, end):
|
||||
# Intra-node broadcast. Each GPU receives final reduced data from gpus_per_node peers,
|
||||
# packets_per_gpu each → gpus_per_node * packets_per_gpu = total_gpus slots.
|
||||
# Total = 2 * total_gpus + num_nodes * packets_per_gpu
|
||||
scratch_buffer_size = 2 * total_gpus + packets_per_gpu * num_nodes
|
||||
for node_id in range(num_nodes):
|
||||
for local_gpu_id in range(gpus_per_node):
|
||||
current_rank_id = local_gpu_id + gpus_per_node * node_id
|
||||
next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus
|
||||
scratch_buffers.append(Buffer(current_rank_id, scratch_buffer_size))
|
||||
for peer_gpu_id in range(gpus_per_node):
|
||||
if peer_gpu_id != local_gpu_id:
|
||||
@@ -66,7 +78,12 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
intra_node_memory_channels[(peer_rank_id, current_rank_id)] = MemoryChannel(
|
||||
peer_rank_id, current_rank_id
|
||||
)
|
||||
inter_node_port_channels[current_rank_id] = PortChannel(next_node_rank_id, current_rank_id)
|
||||
for peer_node_id in range(num_nodes):
|
||||
if peer_node_id != node_id:
|
||||
peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus
|
||||
inter_node_port_channels[(current_rank_id, peer_node_rank_id)] = PortChannel(
|
||||
peer_node_rank_id, current_rank_id
|
||||
)
|
||||
|
||||
# AllReduce
|
||||
for node_id in range(num_nodes):
|
||||
@@ -74,7 +91,6 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
current_rank_id = local_gpu_id + gpus_per_node * node_id
|
||||
current_rank = Rank(current_rank_id)
|
||||
input_buffer = current_rank.get_input_buffer()
|
||||
next_node_rank_id = (local_gpu_id + gpus_per_node * (node_id + 1)) % total_gpus
|
||||
|
||||
# Intra Node Exchange Data
|
||||
for peer_gpu_id in range(gpus_per_node):
|
||||
@@ -118,27 +134,32 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
)
|
||||
|
||||
inter_node_offset = total_gpus
|
||||
inter_node_port_channels[current_rank_id].put_packets(
|
||||
scratch_buffers[next_node_rank_id][
|
||||
inter_node_offset
|
||||
+ local_gpu_id * packets_per_gpu : inter_node_offset
|
||||
+ local_gpu_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
scratch_buffers[current_rank_id][
|
||||
local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
|
||||
],
|
||||
tb=0,
|
||||
)
|
||||
for peer_node_id in range(num_nodes):
|
||||
if peer_node_id != node_id:
|
||||
peer_node_rank_id = (local_gpu_id + gpus_per_node * peer_node_id) % total_gpus
|
||||
inter_node_port_channels[(current_rank_id, peer_node_rank_id)].put_packets(
|
||||
scratch_buffers[peer_node_rank_id][
|
||||
inter_node_offset
|
||||
+ node_id * packets_per_gpu : inter_node_offset
|
||||
+ node_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
scratch_buffers[current_rank_id][
|
||||
local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu
|
||||
],
|
||||
tb=0,
|
||||
)
|
||||
|
||||
# Reduce Received Data from Remote Node
|
||||
inter_node_data = [
|
||||
scratch_buffers[current_rank_id][
|
||||
inter_node_offset
|
||||
+ local_gpu_id * packets_per_gpu : inter_node_offset
|
||||
+ local_gpu_id * packets_per_gpu
|
||||
+ peer_node_id * packets_per_gpu : inter_node_offset
|
||||
+ peer_node_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
]
|
||||
for peer_node_id in range(num_nodes)
|
||||
if peer_node_id != node_id
|
||||
]
|
||||
current_rank.reduce(
|
||||
input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
|
||||
@@ -148,12 +169,18 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
)
|
||||
|
||||
current_rank.copy_packets(
|
||||
scratch_buffers[current_rank_id][scratch_buffer_size - packets_per_gpu : scratch_buffer_size],
|
||||
scratch_buffers[current_rank_id][
|
||||
inter_node_offset
|
||||
+ node_id * packets_per_gpu : inter_node_offset
|
||||
+ node_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
input_buffer[local_gpu_id * packets_per_gpu : local_gpu_id * packets_per_gpu + packets_per_gpu],
|
||||
tb_group=global_intra_node_tbg,
|
||||
)
|
||||
|
||||
# Broadcast Reduced Data
|
||||
broadcast_offset = total_gpus + packets_per_gpu * num_nodes
|
||||
for peer_gpu_id in range(gpus_per_node):
|
||||
peer_rank_id = peer_gpu_id + gpus_per_node * node_id
|
||||
|
||||
@@ -161,13 +188,16 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
tbg_id = peer_gpu_id if peer_gpu_id < local_gpu_id else peer_gpu_id - 1
|
||||
intra_node_memory_channels[(peer_rank_id, current_rank_id)].read_put_packets(
|
||||
scratch_buffers[peer_rank_id][
|
||||
inter_node_offset
|
||||
+ local_gpu_id * packets_per_gpu : inter_node_offset
|
||||
broadcast_offset
|
||||
+ local_gpu_id * packets_per_gpu : broadcast_offset
|
||||
+ local_gpu_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
scratch_buffers[current_rank_id][
|
||||
scratch_buffer_size - packets_per_gpu : scratch_buffer_size
|
||||
inter_node_offset
|
||||
+ node_id * packets_per_gpu : inter_node_offset
|
||||
+ node_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
tb_group=thread_block_groups[tbg_id],
|
||||
)
|
||||
@@ -181,8 +211,8 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
peer_gpu_id * packets_per_gpu : peer_gpu_id * packets_per_gpu + packets_per_gpu
|
||||
],
|
||||
scratch_buffers[current_rank_id][
|
||||
inter_node_offset
|
||||
+ peer_gpu_id * packets_per_gpu : inter_node_offset
|
||||
broadcast_offset
|
||||
+ peer_gpu_id * packets_per_gpu : broadcast_offset
|
||||
+ peer_gpu_id * packets_per_gpu
|
||||
+ packets_per_gpu
|
||||
],
|
||||
@@ -190,3 +220,37 @@ def allreduce_2nodes(spec: AlgoSpec, thread_block_group_size: int) -> Collective
|
||||
)
|
||||
|
||||
return prog
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--name", type=str, help="name of the program")
|
||||
parser.add_argument("--num_gpus", type=int, help="total number of gpus")
|
||||
parser.add_argument("--gpus_per_node", type=int, help="number of gpus per node")
|
||||
parser.add_argument("--tbg", type=int, default=1, help="thread block group size")
|
||||
parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
|
||||
parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
|
||||
parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
spec = AlgoSpec(
|
||||
name=args.name,
|
||||
collective=AllReduce(args.num_gpus, 1, True),
|
||||
nranks_per_node=args.gpus_per_node,
|
||||
world_size=args.num_gpus,
|
||||
in_place=True,
|
||||
instances=1,
|
||||
protocol="LL",
|
||||
auto_sync=False,
|
||||
num_threads_per_block=args.num_threads_per_block,
|
||||
reuse_resources=True,
|
||||
use_double_scratch_buffer=True,
|
||||
min_message_size=args.min_message_size,
|
||||
max_message_size=args.max_message_size,
|
||||
)
|
||||
|
||||
prog = allreduce_multi_nodes(spec, args.tbg)
|
||||
print(prog.to_json())
|
||||
@@ -192,12 +192,14 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
|
||||
return DataType.int32
|
||||
elif dtype == torch.bfloat16:
|
||||
return DataType.bfloat16
|
||||
# Hardware supports either OCP format or FNUZ format for float8.
|
||||
# Mapping both to the same MSCClPP data type.
|
||||
elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz:
|
||||
elif dtype == torch.float8_e5m2:
|
||||
return DataType.float8_e5m2
|
||||
elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz:
|
||||
return DataType.float8_e4m3
|
||||
elif dtype == torch.float8_e5m2fnuz:
|
||||
return DataType.float8_e5m2fnuz
|
||||
elif dtype == torch.float8_e4m3fn:
|
||||
return DataType.float8_e4m3fn
|
||||
elif dtype == torch.float8_e4m3fnuz:
|
||||
return DataType.float8_e4m3fnuz
|
||||
elif dtype == torch.uint8:
|
||||
return DataType.uint8
|
||||
else:
|
||||
|
||||
@@ -24,6 +24,8 @@ def parse_dtype(dtype_str):
|
||||
dtype_str = dtype_str.strip().lower()
|
||||
if dtype_str == "float16":
|
||||
return cp.float16
|
||||
elif dtype_str in ("bfloat16", "bf16"):
|
||||
return cp.float16 # same 2-byte size; mscclpp DataType is resolved from dtype_str
|
||||
elif dtype_str == "float32":
|
||||
return cp.float32
|
||||
elif dtype_str == "int32":
|
||||
@@ -119,15 +121,18 @@ def parse_size(size_str):
|
||||
return int(size_str)
|
||||
|
||||
|
||||
def dtype_to_mscclpp_dtype(dtype):
|
||||
if dtype == cp.float16:
|
||||
def dtype_to_mscclpp_dtype(dtype_str):
|
||||
dtype_str = dtype_str.strip().lower()
|
||||
if dtype_str == "float16":
|
||||
return DataType.float16
|
||||
elif dtype == cp.float32:
|
||||
elif dtype_str in ("bfloat16", "bf16"):
|
||||
return DataType.bfloat16
|
||||
elif dtype_str == "float32":
|
||||
return DataType.float32
|
||||
elif dtype == cp.int32:
|
||||
elif dtype_str == "int32":
|
||||
return DataType.int32
|
||||
else:
|
||||
raise ValueError(f"Unknown data type: {dtype}")
|
||||
raise ValueError(f"Unknown data type: {dtype_str}")
|
||||
|
||||
|
||||
def build_bufs(
|
||||
@@ -205,7 +210,7 @@ def main(
|
||||
result_buf.data.ptr,
|
||||
input_buf.nbytes,
|
||||
result_buf.nbytes,
|
||||
dtype_to_mscclpp_dtype(dtype),
|
||||
dtype_to_mscclpp_dtype(dtype_str),
|
||||
execution_plan,
|
||||
stream.ptr,
|
||||
packet_type,
|
||||
@@ -231,7 +236,7 @@ def main(
|
||||
npkit.shutdown()
|
||||
print(
|
||||
f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
|
||||
f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
|
||||
f"data size: {result_buf.nbytes} bytes data type: {dtype_str} "
|
||||
f"packet type: {packet_type}"
|
||||
)
|
||||
executor = None
|
||||
@@ -243,7 +248,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
|
||||
parser.add_argument("--size", type=str, required=True)
|
||||
parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
|
||||
parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
|
||||
parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32")
|
||||
parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
|
||||
parser.add_argument("--n_iters", type=int, default=10)
|
||||
parser.add_argument("--n_graph_iters", type=int, default=10)
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#else
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
@@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) {
|
||||
} \
|
||||
}
|
||||
|
||||
FILL_DATA(bfloat16, __nv_bfloat16)
|
||||
FILL_DATA(float16, __half)
|
||||
FILL_DATA(float32, float)
|
||||
FILL_DATA(int32, int)
|
||||
@@ -48,11 +51,12 @@ FILL_DATA(int32, int)
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
|
||||
TEST_DATA_ALL_GATHER(float16, __half)
|
||||
TEST_DATA_ALL_GATHER(float32, float)
|
||||
TEST_DATA_ALL_GATHER(int32, int)
|
||||
|
||||
#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType) \
|
||||
#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
for (int rank = 0; rank < num_ranks; rank++) { \
|
||||
@@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int)
|
||||
} \
|
||||
} \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks); \
|
||||
float expected = float(test_buf[i]); \
|
||||
float result = float(result_buf[i]); \
|
||||
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
|
||||
assert(abs(result - expected) <= tol); \
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_REDUCE(float16, __half)
|
||||
TEST_DATA_ALL_REDUCE(float32, float)
|
||||
TEST_DATA_ALL_REDUCE(int32, int)
|
||||
TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
|
||||
TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f)
|
||||
TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
|
||||
TEST_DATA_ALL_REDUCE(int32, int, 0.0f)
|
||||
|
||||
#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType) \
|
||||
#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
int nem_elems_per_rank = num_elems / num_ranks; \
|
||||
@@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int)
|
||||
} \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
if (i >= offset && i < offset + nem_elems_per_rank) { \
|
||||
assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks); \
|
||||
float expected = float(test_buf[i]); \
|
||||
float result = float(result_buf[i - offset]); \
|
||||
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
|
||||
assert(abs(result - expected) <= tol); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_REDUCE_SCATTER(float16, __half)
|
||||
TEST_DATA_REDUCE_SCATTER(float32, float)
|
||||
TEST_DATA_REDUCE_SCATTER(int32, int)
|
||||
TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f)
|
||||
TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
|
||||
TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
|
||||
TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)
|
||||
|
||||
#define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType( \
|
||||
@@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
|
||||
TEST_DATA_ALL_TO_ALL(float16, __half)
|
||||
TEST_DATA_ALL_TO_ALL(float32, float)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
@@ -21,6 +21,13 @@ from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
# FP8 E4M3 (hardware) requires SM >= 89 (Ada / Hopper) on NVIDIA GPUs.
|
||||
# On AMD/ROCm (e.g. MI300X), FP8 is supported natively — no skip needed.
|
||||
_is_hip = hasattr(cp.cuda.runtime, "is_hip") and cp.cuda.runtime.is_hip
|
||||
_gcn_arch_name = ""
|
||||
if _is_hip:
|
||||
_gcn_arch_name = cp.cuda.runtime.getDeviceProperties(0).get("gcnArchName", b"")
|
||||
if isinstance(_gcn_arch_name, bytes):
|
||||
_gcn_arch_name = _gcn_arch_name.decode()
|
||||
_gcn_arch_name = _gcn_arch_name.split(":", maxsplit=1)[0]
|
||||
_is_cdna4 = _gcn_arch_name.startswith("gfx95")
|
||||
_skip_fp8 = not _is_hip and int(cp.cuda.Device().compute_capability) < 89
|
||||
pytestmark = pytest.mark.skipif(_skip_fp8, reason="FP8 accum tests require SM >= 89 on CUDA")
|
||||
|
||||
@@ -90,7 +97,78 @@ def float_to_e4m3fn(f32_array, chunk_size=65536):
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FP8 E4M3B15 helpers (bias=15, max=0.9375, NaN = exp==15 or bits==0x80)
|
||||
# FP8 E4M3FNUZ helpers (AMD/ROCm; bias=8, max=240, NaN = bits==0x80, no -0)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def e4m3fnuz_to_float(uint8_array):
|
||||
"""Decode a cupy uint8 array of E4M3FNUZ bit patterns to float32."""
|
||||
bits = uint8_array.astype(cp.int32)
|
||||
sign = (bits >> 7) & 1
|
||||
exp = (bits >> 3) & 0xF
|
||||
mant = bits & 0x7
|
||||
|
||||
# Normal: (-1)^s * 2^(exp-8) * (1 + mant/8)
|
||||
normal_val = cp.ldexp(cp.float32(1.0) + mant.astype(cp.float32) / cp.float32(8.0), (exp - 8).astype(cp.int32))
|
||||
# Subnormal (exp==0): (-1)^s * 2^(-7) * (mant/8)
|
||||
subnormal_val = cp.ldexp(mant.astype(cp.float32) / cp.float32(8.0), cp.int32(-7))
|
||||
|
||||
result = cp.where(exp == 0, subnormal_val, normal_val)
|
||||
result = cp.where(sign == 1, -result, result)
|
||||
# Zero is only 0x00; the 0x80 encoding is reserved for NaN under fnuz.
|
||||
result = cp.where(uint8_array.astype(cp.int32) == 0, cp.float32(0.0), result)
|
||||
nan_mask = uint8_array.astype(cp.int32) == 0x80
|
||||
result = cp.where(nan_mask, cp.float32(float("nan")), result)
|
||||
return result
|
||||
|
||||
|
||||
def float_to_e4m3fnuz(f32_array, chunk_size=65536):
|
||||
"""Encode a cupy float32 array to uint8 E4M3FNUZ bit patterns.
|
||||
|
||||
Same lookup-table approach as float_to_e4m3fn but using the fnuz table.
|
||||
"""
|
||||
all_bytes = cp.arange(128, dtype=cp.uint8)
|
||||
all_floats = e4m3fnuz_to_float(all_bytes)
|
||||
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
|
||||
|
||||
clamped = f32_array.astype(cp.float32)
|
||||
clamped = cp.clip(clamped, -240.0, 240.0)
|
||||
signs = (clamped < 0).astype(cp.uint8)
|
||||
absval = cp.abs(clamped)
|
||||
|
||||
result = cp.zeros(absval.shape, dtype=cp.uint8)
|
||||
n = absval.size
|
||||
absval_flat = absval.ravel()
|
||||
result_flat = result.ravel()
|
||||
|
||||
for start in range(0, n, chunk_size):
|
||||
end = min(start + chunk_size, n)
|
||||
chunk = absval_flat[start:end]
|
||||
diffs = cp.abs(chunk[:, None] - all_floats[None, :])
|
||||
result_flat[start:end] = cp.argmin(diffs, axis=1).astype(cp.uint8)
|
||||
|
||||
result = result_flat.reshape(absval.shape)
|
||||
result = result | (signs << 7)
|
||||
# 0x80 is NaN under fnuz (no negative zero). Collapse any encoding that
|
||||
# landed on 0x80 (small negatives quantised to zero magnitude) to 0x00.
|
||||
result = cp.where(result == 0x80, cp.uint8(0), result)
|
||||
return result
|
||||
|
||||
|
||||
# Platform-aware E4M3 native helpers: ROCm CDNA4 and CUDA use OCP fn; older ROCm uses fnuz.
|
||||
if _is_hip and not _is_cdna4:
|
||||
e4m3_native_to_float = e4m3fnuz_to_float
|
||||
float_to_e4m3_native = float_to_e4m3fnuz
|
||||
fp8_native_dtype = DataType.float8_e4m3fnuz
|
||||
else:
|
||||
e4m3_native_to_float = e4m3fn_to_float
|
||||
float_to_e4m3_native = float_to_e4m3fn
|
||||
fp8_native_dtype = DataType.float8_e4m3fn
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FP8 E4M3B15 helpers (bias=15, encode saturates to ±1.75, no NaN)
|
||||
# Matches Triton's fp8e4b15: all 256 bit patterns are finite.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@@ -108,11 +186,6 @@ def e4m3b15_to_float(uint8_array):
|
||||
|
||||
result = cp.where(exp == 0, subnormal_val, normal_val)
|
||||
result = cp.where(sign == 1, -result, result)
|
||||
# Zero
|
||||
result = cp.where((exp == 0) & (mant == 0), cp.float32(0.0), result)
|
||||
# NaN: exp==15 or negative zero (0x80)
|
||||
nan_mask = (exp == 15) | (uint8_array.astype(cp.int32) == 0x80)
|
||||
result = cp.where(nan_mask, cp.float32(float("nan")), result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -120,18 +193,17 @@ def float_to_e4m3b15(f32_array, chunk_size=65536):
|
||||
"""Encode a cupy float32 array to uint8 E4M3B15 bit patterns.
|
||||
|
||||
Same lookup-table approach as float_to_e4m3fn.
|
||||
Saturates to ±1.75 (0x7e/0xfe), matching Triton's fp8e4b15.
|
||||
"""
|
||||
# Build lookup table of all 128 positive E4M3B15 values (0x00..0x7F)
|
||||
all_bytes = cp.arange(128, dtype=cp.uint8)
|
||||
all_floats = e4m3b15_to_float(all_bytes) # (128,) float32
|
||||
# Mark NaN entries as inf so they're never selected as nearest
|
||||
all_floats = cp.where(cp.isnan(all_floats), cp.float32(float("inf")), all_floats)
|
||||
|
||||
# Clamp input and extract sign
|
||||
clamped = f32_array.astype(cp.float32)
|
||||
clamped = cp.clip(clamped, -0.9375, 0.9375)
|
||||
signs = (clamped < 0).astype(cp.uint8)
|
||||
absval = cp.abs(clamped)
|
||||
# Clamp input and extract sign.
|
||||
values = f32_array.astype(cp.float32)
|
||||
signs = cp.signbit(values).astype(cp.uint8)
|
||||
absval = cp.abs(values)
|
||||
absval = cp.clip(absval, cp.float32(0.0), cp.float32(1.75))
|
||||
|
||||
result = cp.zeros(absval.shape, dtype=cp.uint8)
|
||||
n = absval.size
|
||||
@@ -148,8 +220,6 @@ def float_to_e4m3b15(f32_array, chunk_size=65536):
|
||||
# Combine with sign bit
|
||||
result = result_flat.reshape(absval.shape)
|
||||
result = result | (signs << 7)
|
||||
# Handle exact zero
|
||||
result = cp.where(absval == 0, cp.uint8(0), result)
|
||||
return result
|
||||
|
||||
|
||||
@@ -226,12 +296,6 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
|
||||
buf = GpuBuffer(size, dtype=cp.uint8)
|
||||
|
||||
accum_configs = [
|
||||
("fp8_native", DataType.float8_e4m3),
|
||||
("float16", DataType.float16),
|
||||
("float32", DataType.float32),
|
||||
]
|
||||
|
||||
# rsag_zero_copy and fullmesh need explicit block/thread counts
|
||||
if "rsag" in algo_name:
|
||||
nb = max(1, min(32, size // (world_size * 32)))
|
||||
@@ -243,13 +307,19 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
nb = 0
|
||||
nt = 0
|
||||
|
||||
accum_configs = [
|
||||
("fp8_native", fp8_native_dtype),
|
||||
("float16", DataType.float16),
|
||||
("float32", DataType.float32),
|
||||
]
|
||||
|
||||
errors = {}
|
||||
for accum_label, accum_dtype in accum_configs:
|
||||
# Generate deterministic per-rank data (use numpy to avoid hipRAND issues on ROCm)
|
||||
rng = np.random.RandomState(42 + rank)
|
||||
src_f32 = cp.asarray(rng.randn(size).astype(np.float32))
|
||||
src_f32 = cp.clip(src_f32, -240.0, 240.0)
|
||||
src_fp8 = float_to_e4m3fn(src_f32)
|
||||
src_fp8 = float_to_e4m3_native(src_f32)
|
||||
|
||||
# Copy into symmetric buffer
|
||||
buf[:] = src_fp8
|
||||
@@ -260,12 +330,12 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
algo,
|
||||
comm_group,
|
||||
buf,
|
||||
dtype=DataType.float8_e4m3,
|
||||
dtype=fp8_native_dtype,
|
||||
accum_dtype=accum_dtype,
|
||||
nblocks=nb,
|
||||
nthreads_per_block=nt,
|
||||
)
|
||||
result_f32 = e4m3fn_to_float(result)
|
||||
result_f32 = e4m3_native_to_float(result)
|
||||
|
||||
# Compute float32 reference: sum all ranks' quantized FP8 inputs in float32
|
||||
ref_f32 = cp.zeros(size, dtype=cp.float32)
|
||||
@@ -273,12 +343,13 @@ def test_fp8_e4m3_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
rng_r = np.random.RandomState(42 + r)
|
||||
rank_data = cp.asarray(rng_r.randn(size).astype(np.float32))
|
||||
rank_data = cp.clip(rank_data, -240.0, 240.0)
|
||||
rank_data_fp8 = float_to_e4m3fn(rank_data)
|
||||
ref_f32 += e4m3fn_to_float(rank_data_fp8)
|
||||
rank_data_fp8 = float_to_e4m3_native(rank_data)
|
||||
ref_f32 += e4m3_native_to_float(rank_data_fp8)
|
||||
|
||||
# Compute errors
|
||||
abs_err = cp.abs(result_f32 - ref_f32)
|
||||
mean_abs_err = float(cp.mean(abs_err))
|
||||
# Compute errors (only on valid, non-NaN entries)
|
||||
valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
|
||||
abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
|
||||
mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
|
||||
errors[accum_label] = mean_abs_err
|
||||
|
||||
# Reset between runs
|
||||
@@ -341,13 +412,10 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
|
||||
errors = {}
|
||||
for accum_label, accum_dtype in accum_configs:
|
||||
# Generate deterministic per-rank random uint8 values in valid e4m3b15 range
|
||||
# Generate deterministic per-rank random uint8 values covering the full e4m3b15 range.
|
||||
# All 256 bit patterns are valid (no NaN in this format).
|
||||
rng = np.random.RandomState(42 + rank)
|
||||
raw = cp.asarray(rng.randint(0, 0x78, (size,)).astype(np.uint8))
|
||||
signs = cp.asarray(rng.randint(0, 2, (size,)).astype(np.uint8)) << 7
|
||||
src_uint8 = raw | signs
|
||||
# Fix negative zero -> positive zero
|
||||
src_uint8 = cp.where(src_uint8 == 0x80, cp.uint8(0), src_uint8)
|
||||
src_uint8 = cp.asarray(rng.randint(0, 256, (size,)).astype(np.uint8))
|
||||
|
||||
# Copy into symmetric buffer
|
||||
buf[:] = src_uint8
|
||||
@@ -371,19 +439,15 @@ def test_fp8_e4m3b15_accum(mpi_group: MpiGroup, algo_name: str, size: int):
|
||||
ref_f32 = cp.zeros(size, dtype=cp.float32)
|
||||
for r in range(world_size):
|
||||
rng_r = np.random.RandomState(42 + r)
|
||||
raw_r = cp.asarray(rng_r.randint(0, 0x78, (size,)).astype(np.uint8))
|
||||
signs_r = cp.asarray(rng_r.randint(0, 2, (size,)).astype(np.uint8)) << 7
|
||||
bits_r = raw_r | signs_r
|
||||
bits_r = cp.where(bits_r == 0x80, cp.uint8(0), bits_r)
|
||||
bits_r = cp.asarray(rng_r.randint(0, 256, (size,)).astype(np.uint8))
|
||||
ref_f32 += e4m3b15_to_float(bits_r)
|
||||
|
||||
# Clamp reference to e4m3b15 representable range
|
||||
ref_f32 = cp.clip(ref_f32, -0.9375, 0.9375)
|
||||
ref_f32 = cp.clip(ref_f32, -1.75, 1.75)
|
||||
|
||||
# Compute errors (only on valid entries)
|
||||
valid = ~cp.isnan(result_f32) & ~cp.isnan(ref_f32)
|
||||
abs_err = cp.abs(result_f32[valid] - ref_f32[valid])
|
||||
mean_abs_err = float(cp.mean(abs_err)) if abs_err.size > 0 else 0.0
|
||||
# Compute errors
|
||||
abs_err = cp.abs(result_f32 - ref_f32)
|
||||
mean_abs_err = float(cp.mean(abs_err))
|
||||
errors[accum_label] = mean_abs_err
|
||||
|
||||
algo.reset()
|
||||
|
||||
@@ -59,6 +59,10 @@ if(MSCCLPP_NPKIT_FLAGS)
|
||||
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
|
||||
endif()
|
||||
|
||||
if(MSCCLPP_USE_MRC)
|
||||
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_MRC)
|
||||
endif()
|
||||
|
||||
# libmscclpp
|
||||
add_library(mscclpp SHARED)
|
||||
target_link_libraries(mscclpp PUBLIC mscclpp_obj)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include <filesystem>
|
||||
#include <mscclpp/algorithm.hpp>
|
||||
#include <mscclpp/errors.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
@@ -182,13 +183,41 @@ CommResult DslAlgorithm::execute(std::shared_ptr<Communicator> comm, const void*
|
||||
stream);
|
||||
break;
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
case DataType::FLOAT8_E4M3:
|
||||
executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3,
|
||||
case DataType::FLOAT8_E4M3FN:
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ");
|
||||
#else
|
||||
executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FN,
|
||||
plan_, stream);
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT8_E4M3FNUZ:
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
executor->execute(rank, (__fp8_e4m3*)input, (__fp8_e4m3*)output, inputSize, outputSize, DataType::FLOAT8_E4M3FNUZ,
|
||||
plan_, stream);
|
||||
#else
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN");
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT8_E5M2:
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ");
|
||||
#else
|
||||
executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2,
|
||||
plan_, stream);
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT8_E5M2FNUZ:
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
executor->execute(rank, (__fp8_e5m2*)input, (__fp8_e5m2*)output, inputSize, outputSize, DataType::FLOAT8_E5M2FNUZ,
|
||||
plan_, stream);
|
||||
#else
|
||||
THROW(EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2");
|
||||
#endif
|
||||
break;
|
||||
#endif
|
||||
case DataType::FLOAT8_E4M3B15:
|
||||
@@ -230,4 +259,4 @@ std::pair<std::shared_ptr<void>, size_t> getFlagBuffer() {
|
||||
return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -7,6 +7,36 @@
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T, typename Impl, typename Func>
|
||||
std::shared_future<T> makeOrderedRecvFuture(Impl* impl, int remoteRank, int tag, Func func) {
|
||||
// Weak placeholder to avoid a reference cycle; updated with the real recvItem after the future is created.
|
||||
auto thisRecvItem = std::make_shared<std::weak_ptr<BaseRecvItem>>();
|
||||
auto future = std::async(std::launch::deferred,
|
||||
[impl, remoteRank, tag, thisRecvItem, lastRecvItem = impl->getLastRecvItem(remoteRank, tag),
|
||||
func = std::move(func)]() mutable {
|
||||
auto cleanup = [impl, remoteRank, tag, thisRecvItem]() {
|
||||
impl->clearLastRecvItemIfMatches(remoteRank, tag, thisRecvItem->lock());
|
||||
};
|
||||
|
||||
if (lastRecvItem) {
|
||||
// Recursive call to the previous receive items
|
||||
lastRecvItem->wait();
|
||||
}
|
||||
auto result = func();
|
||||
cleanup();
|
||||
return result;
|
||||
});
|
||||
auto sharedFuture = std::shared_future<T>(std::move(future));
|
||||
auto recvItem = std::make_shared<RecvItem<T>>(sharedFuture);
|
||||
*thisRecvItem = recvItem;
|
||||
impl->setLastRecvItem(remoteRank, tag, recvItem);
|
||||
return sharedFuture;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Communicator::Impl::Impl(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
|
||||
: bootstrap_(bootstrap) {
|
||||
if (!context) {
|
||||
@@ -32,6 +62,14 @@ std::shared_ptr<BaseRecvItem> Communicator::Impl::getLastRecvItem(int remoteRank
|
||||
return it->second;
|
||||
}
|
||||
|
||||
void Communicator::Impl::clearLastRecvItemIfMatches(int remoteRank, int tag,
|
||||
const std::shared_ptr<BaseRecvItem>& expectedItem) {
|
||||
auto it = lastRecvItems_.find({remoteRank, tag});
|
||||
if (it != lastRecvItems_.end() && it->second == expectedItem) {
|
||||
lastRecvItems_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP Communicator::~Communicator() = default;
|
||||
|
||||
MSCCLPP_API_CPP Communicator::Communicator(std::shared_ptr<Bootstrap> bootstrap, std::shared_ptr<Context> context)
|
||||
@@ -83,19 +121,11 @@ MSCCLPP_API_CPP std::shared_future<RegisteredMemory> Communicator::recvMemory(in
|
||||
locRecvMemList.push_back(std::move(locRecvMem));
|
||||
return future;
|
||||
}
|
||||
auto future = std::async(std::launch::deferred,
|
||||
[this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() {
|
||||
if (lastRecvItem) {
|
||||
// Recursive call to the previous receive items
|
||||
lastRecvItem->wait();
|
||||
}
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
return RegisteredMemory::deserialize(data);
|
||||
});
|
||||
auto shared_future = std::shared_future<RegisteredMemory>(std::move(future));
|
||||
pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<RegisteredMemory>>(shared_future));
|
||||
return shared_future;
|
||||
return makeOrderedRecvFuture<RegisteredMemory>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag]() {
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
return RegisteredMemory::deserialize(data);
|
||||
});
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpoint& localEndpoint, int remoteRank,
|
||||
@@ -112,22 +142,15 @@ MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const Endpo
|
||||
|
||||
bootstrap()->send(localEndpoint.serialize(), remoteRank, tag);
|
||||
|
||||
auto future = std::async(std::launch::deferred, [this, remoteRank, tag, localEndpoint,
|
||||
lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag)]() mutable {
|
||||
if (lastRecvItem) {
|
||||
// Recursive call to the previous receive items
|
||||
lastRecvItem->wait();
|
||||
}
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
auto remoteEndpoint = Endpoint::deserialize(data);
|
||||
auto connection = context()->connect(localEndpoint, remoteEndpoint);
|
||||
pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
|
||||
return connection;
|
||||
});
|
||||
auto shared_future = std::shared_future<Connection>(std::move(future));
|
||||
pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Connection>>(shared_future));
|
||||
return shared_future;
|
||||
return makeOrderedRecvFuture<Connection>(pimpl_.get(), remoteRank, tag,
|
||||
[this, remoteRank, tag, localEndpoint]() mutable {
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
auto remoteEndpoint = Endpoint::deserialize(data);
|
||||
auto connection = context()->connect(localEndpoint, remoteEndpoint);
|
||||
pimpl_->connectionInfos_[connection.impl_.get()] = {remoteRank, tag};
|
||||
return connection;
|
||||
});
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP std::shared_future<Connection> Communicator::connect(const EndpointConfig& localConfig, int remoteRank,
|
||||
@@ -141,21 +164,12 @@ MSCCLPP_API_CPP std::shared_future<Semaphore> Communicator::buildSemaphore(const
|
||||
SemaphoreStub localStub(connection);
|
||||
bootstrap()->send(localStub.serialize(), remoteRank, tag);
|
||||
|
||||
auto future =
|
||||
std::async(std::launch::deferred, [this, remoteRank, tag, lastRecvItem = pimpl_->getLastRecvItem(remoteRank, tag),
|
||||
localStub = localStub]() mutable {
|
||||
if (lastRecvItem) {
|
||||
// Recursive call to the previous receive items
|
||||
lastRecvItem->wait();
|
||||
}
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
auto remoteStub = SemaphoreStub::deserialize(data);
|
||||
return Semaphore(localStub, remoteStub);
|
||||
});
|
||||
auto shared_future = std::shared_future<Semaphore>(std::move(future));
|
||||
pimpl_->setLastRecvItem(remoteRank, tag, std::make_shared<RecvItem<Semaphore>>(shared_future));
|
||||
return shared_future;
|
||||
return makeOrderedRecvFuture<Semaphore>(pimpl_.get(), remoteRank, tag, [this, remoteRank, tag, localStub]() mutable {
|
||||
std::vector<char> data;
|
||||
bootstrap()->recv(data, remoteRank, tag);
|
||||
auto remoteStub = SemaphoreStub::deserialize(data);
|
||||
return Semaphore(localStub, remoteStub);
|
||||
});
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) {
|
||||
|
||||
@@ -7,13 +7,13 @@
|
||||
#include <mscclpp/npkit/npkit.hpp>
|
||||
#endif
|
||||
|
||||
#include <mscclpp/atomic_device.hpp>
|
||||
#include <mscclpp/numa.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
|
||||
#include "api.h"
|
||||
#include "atomic.hpp"
|
||||
#include "context.hpp"
|
||||
#include "endpoint.hpp"
|
||||
#include "gpu_utils_internal.hpp"
|
||||
@@ -43,7 +43,10 @@ const RegisteredMemory::Impl& BaseConnection::getImpl(const RegisteredMemory& me
|
||||
Context::Impl& BaseConnection::getImpl(Context& context) { return *(context.pimpl_); }
|
||||
|
||||
MSCCLPP_API_CPP BaseConnection::BaseConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint)
|
||||
: context_(context), localEndpoint_(localEndpoint), maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()) {}
|
||||
: context_(context),
|
||||
localEndpoint_(localEndpoint),
|
||||
maxWriteQueueSize_(localEndpoint.maxWriteQueueSize()),
|
||||
gpuFlushDonePos_(detail::gpuCallocHostShared<uint64_t>()) {}
|
||||
|
||||
MSCCLPP_API_CPP std::shared_ptr<Context> BaseConnection::context() const { return context_; }
|
||||
|
||||
@@ -86,6 +89,17 @@ MSCCLPP_API_CPP int Connection::getMaxWriteQueueSize() const { return impl_->get
|
||||
CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
|
||||
const Endpoint& remoteEndpoint)
|
||||
: BaseConnection(context, localEndpoint) {
|
||||
// Log fabric/MNNVL availability exactly once per process so any later cross-node CudaIpc failure
|
||||
// is easy to triage. C++11 magic statics make this thread-safe without an explicit mutex.
|
||||
// NOTE: assigning the message to a std::string first avoids the logger's pointer-formatting
|
||||
// overload from kicking in on the const char* result of the ternary.
|
||||
[[maybe_unused]] static const bool fabricAvailable_ = []() {
|
||||
const bool avail = isFabricMemHandleAvailable();
|
||||
const std::string status = avail ? "available (cross-node CudaIpc via MNNVL/IMEX is supported)"
|
||||
: "NOT available (CudaIpc is restricted to intra-node ranks on this system)";
|
||||
INFO(CONN, "CudaIpc transport selected: fabric handles ", status);
|
||||
return avail;
|
||||
}();
|
||||
if (localEndpoint.transport() != Transport::CudaIpc || remoteEndpoint.transport() != Transport::CudaIpc) {
|
||||
THROW(CONN, Error, ErrorCode::InternalError, "CudaIpc transport is required for CudaIpcConnection");
|
||||
}
|
||||
@@ -507,6 +521,35 @@ void IBConnection::atomicAdd(RegisteredMemory dst, uint64_t dstOffset, int64_t v
|
||||
INFO(CONN, "IBConnection atomicAdd: dst ", (uint8_t*)dstMrInfo.addr + dstOffset, ", value ", value);
|
||||
}
|
||||
|
||||
void IBConnection::requestFlush() {
|
||||
// No-op: IB sends were already posted by prior conn.write() calls in handleTrigger.
|
||||
// progressFlush() drives completion by polling the send CQ.
|
||||
}
|
||||
|
||||
bool IBConnection::progressFlush() {
|
||||
if (recvThreadError_.load(std::memory_order_acquire)) {
|
||||
THROW(CONN, Error, ErrorCode::SystemError, "IBConnection recv thread failed: ", recvThreadErrorMsg_);
|
||||
}
|
||||
|
||||
auto qp = qp_.lock();
|
||||
if (!qp || qp->getNumSendCqItems() == 0) {
|
||||
return true; // QP expired or CQ already drained.
|
||||
}
|
||||
|
||||
int wcNum = qp->pollSendCq();
|
||||
if (wcNum < 0) {
|
||||
THROW(NET, IbError, errno, "pollSendCq failed in progressFlush");
|
||||
}
|
||||
for (int i = 0; i < wcNum; ++i) {
|
||||
int status = qp->getSendWcStatus(i);
|
||||
if (status != static_cast<int>(WsStatus::Success)) {
|
||||
THROW(NET, Error, ErrorCode::SystemError,
|
||||
"an IB work item failed in progressFlush: ", qp->getSendWcStatusString(i));
|
||||
}
|
||||
}
|
||||
return qp->getNumSendCqItems() == 0;
|
||||
}
|
||||
|
||||
// EthernetConnection
|
||||
|
||||
EthernetConnection::EthernetConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
|
||||
|
||||
@@ -4,12 +4,11 @@
|
||||
#include "context.hpp"
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#include "api.h"
|
||||
#include "connection.hpp"
|
||||
#include "debug.h"
|
||||
#include "endpoint.hpp"
|
||||
#include "logger.hpp"
|
||||
#include "registered_memory.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
@@ -105,19 +104,17 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
|
||||
|
||||
MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
|
||||
if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
|
||||
throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for local endpoint");
|
||||
}
|
||||
if (remoteEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().id < 0) {
|
||||
throw Error("No GPU device ID provided for remote endpoint", ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for remote endpoint");
|
||||
}
|
||||
auto localTransport = localEndpoint.transport();
|
||||
auto remoteTransport = remoteEndpoint.transport();
|
||||
if (localTransport != remoteTransport &&
|
||||
!(AllIBTransports.has(localTransport) && AllIBTransports.has(remoteTransport))) {
|
||||
std::stringstream ss;
|
||||
ss << "Transport mismatch between local (" << localTransport << ") and remote (" << remoteEndpoint.transport()
|
||||
<< ") endpoints";
|
||||
throw Error(ss.str(), ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "Transport mismatch between local (", localTransport, ") and remote (",
|
||||
remoteTransport, ") endpoints");
|
||||
}
|
||||
std::shared_ptr<BaseConnection> conn;
|
||||
if (localTransport == Transport::CudaIpc) {
|
||||
@@ -127,7 +124,9 @@ MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const
|
||||
} else if (localTransport == Transport::Ethernet) {
|
||||
conn = std::make_shared<EthernetConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
|
||||
} else {
|
||||
throw Error("Unsupported transport", ErrorCode::InternalError);
|
||||
THROW(CONN, Error, ErrorCode::InternalError, "Unsupported transport: ", localTransport,
|
||||
" (this usually means EndpointConfig.transport was left at Transport::Unknown — "
|
||||
"set it explicitly to CudaIpc, an IB transport, or Ethernet)");
|
||||
}
|
||||
return Connection(conn);
|
||||
}
|
||||
|
||||
@@ -78,8 +78,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
|
||||
);
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT8_E4M3:
|
||||
case DataType::FLOAT8_E4M3FN:
|
||||
case DataType::FLOAT8_E4M3FNUZ:
|
||||
case DataType::FLOAT8_E5M2:
|
||||
case DataType::FLOAT8_E5M2FNUZ:
|
||||
// FP8 is not supported in CUDA execution kernel.
|
||||
break;
|
||||
case DataType::FLOAT8_E4M3B15:
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
#include <mscclpp/switch_channel.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
|
||||
#include "debug.h"
|
||||
#include "execution_kernel.hpp"
|
||||
#include "execution_plan.hpp"
|
||||
|
||||
@@ -509,8 +508,7 @@ Executor::Executor(std::shared_ptr<Communicator> comm, std::shared_ptr<char> def
|
||||
void Executor::execute(int rank, void* sendbuff, void* recvbuff, size_t sendBuffSize,
|
||||
[[maybe_unused]] size_t recvBuffSize, DataType dataType, const ExecutionPlan& plan,
|
||||
cudaStream_t stream, PacketType packetType) {
|
||||
INFO(MSCCLPP_EXECUTOR, "Starting execution with plan: %s, collective: %s", plan.name().c_str(),
|
||||
plan.collective().c_str());
|
||||
INFO(LogSubsys::EXEC, "Starting execution with plan: ", plan.name(), ", collective: ", plan.collective());
|
||||
size_t sendMemRange, recvMemRange;
|
||||
CUdeviceptr sendBasePtr, recvBasePtr;
|
||||
MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendMemRange, (CUdeviceptr)sendbuff));
|
||||
|
||||
@@ -53,6 +53,8 @@ MSCCLPP_API_CPP void Fifo::pop() {
|
||||
atomicStore(pimpl_->tail.get(), curTail + 1, memoryOrderRelease);
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP uint64_t Fifo::tail() const { return *(pimpl_->tail); }
|
||||
|
||||
MSCCLPP_API_CPP int Fifo::size() const { return pimpl_->size; }
|
||||
|
||||
MSCCLPP_API_CPP FifoDeviceHandle Fifo::deviceHandle() const {
|
||||
|
||||
@@ -48,7 +48,7 @@ GdrStatus gdrStatus() { return gdrContext()->status(); }
|
||||
|
||||
bool gdrEnabled() { return gdrStatus() == GdrStatus::Ok; }
|
||||
|
||||
const char* gdrStatusMessage() {
|
||||
std::string gdrStatusMessage() {
|
||||
switch (gdrStatus()) {
|
||||
case GdrStatus::Ok:
|
||||
return "GDRCopy initialized successfully";
|
||||
@@ -181,7 +181,7 @@ GdrStatus gdrStatus() { return GdrStatus::NotBuilt; }
|
||||
|
||||
bool gdrEnabled() { return false; }
|
||||
|
||||
const char* gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
|
||||
std::string gdrStatusMessage() { return "mscclpp was not built with GDRCopy support (MSCCLPP_USE_GDRCOPY not set)"; }
|
||||
|
||||
// GdrMap::Impl — stub (no GDRCopy)
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <cstring>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "unix_socket.hpp"
|
||||
@@ -35,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typ
|
||||
return os;
|
||||
}
|
||||
|
||||
[[maybe_unused]] static bool isFabricMemHandleAvailable() {
|
||||
bool isFabricMemHandleAvailable() {
|
||||
#if (CUDA_NVLS_API_AVAILABLE)
|
||||
static int resultCache = -1; // -1: uninitialized, 0: not available, 1: available
|
||||
if (resultCache != -1) {
|
||||
@@ -283,11 +284,19 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
|
||||
THROW(GPU, Error, ErrorCode::InvalidUsage, "GpuIpcMemHandle type is None, cannot create GpuIpcMem");
|
||||
}
|
||||
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
|
||||
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
|
||||
CUDA_SUCCESS) {
|
||||
CUresult res =
|
||||
cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
if (res == CUDA_SUCCESS) {
|
||||
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
|
||||
handle_.fabric.allocHandle = {};
|
||||
type_ = GpuIpcMemHandle::Type::Fabric;
|
||||
} else {
|
||||
const char* errStr = nullptr;
|
||||
(void)cuGetErrorString(res, &errStr);
|
||||
const std::string errMsg = errStr ? std::string(errStr) : std::string("unknown CUDA error");
|
||||
WARN(GPU, "Fabric IPC handle import failed (", errMsg,
|
||||
"); cross-node CudaIpc requires NVIDIA MNNVL hardware and a running IMEX service. ",
|
||||
"Falling back to other handle types if available.");
|
||||
}
|
||||
}
|
||||
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::PosixFd)) {
|
||||
@@ -303,7 +312,17 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
|
||||
type_ = GpuIpcMemHandle::Type::RuntimeIpc;
|
||||
}
|
||||
if (type_ == GpuIpcMemHandle::Type::None) {
|
||||
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (type: ", handle_.typeFlags, ")");
|
||||
const bool fabricOnly = (handle_.typeFlags == GpuIpcMemHandle::Type::Fabric);
|
||||
const std::string hint = fabricOnly
|
||||
? std::string(
|
||||
"The remote rank sent only a Fabric (MNNVL) handle, but this rank could not "
|
||||
"import it. Check that the IMEX daemon is running on both nodes and that the "
|
||||
"GPUs share an NVLink fabric.")
|
||||
: std::string(
|
||||
"All handle types failed to import; check IMEX service and POSIX FD socket "
|
||||
"availability.");
|
||||
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (offered types: ", handle_.typeFlags, "). ",
|
||||
hint);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void gpuMemset(void* ptr, int value, size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
bool isNvlsSupported() {
|
||||
@@ -283,7 +290,9 @@ bool isNvlsSupported() {
|
||||
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGet(&dev, deviceId));
|
||||
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
|
||||
return isMulticastSupported == 1;
|
||||
result = (isMulticastSupported == 1);
|
||||
isChecked = true;
|
||||
return result;
|
||||
}
|
||||
return result;
|
||||
#endif
|
||||
@@ -300,9 +309,6 @@ bool isCuMemMapAllocated([[maybe_unused]] void* ptr) {
|
||||
return false;
|
||||
}
|
||||
MSCCLPP_CUTHROW(cuMemRelease(handle));
|
||||
if (!isNvlsSupported()) {
|
||||
throw Error("cuMemMap is used in env without NVLS support", ErrorCode::InvalidUsage);
|
||||
}
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -10,11 +10,28 @@
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
// Adding MSCCLPP_USE_MRC micro for MRC enablement.
|
||||
// Non-MRC environments will not be affected by this macro as long as VMRC_LIBIBVERBS_SO
|
||||
// environment variable is not set.
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
#include <cstdlib>
|
||||
#include <set>
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
static std::unique_ptr<void, int (*)(void*)> globalIBVerbsHandle(nullptr, &::dlclose);
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
static std::unique_ptr<void, int (*)(void*)> globalOrigIBVerbsHandle(nullptr, &::dlclose);
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
|
||||
void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
static std::set<std::string> mrcSymbols = {
|
||||
"ibv_get_device_list", "ibv_get_device_name", "ibv_open_device", "ibv_close_device", "ibv_query_qp",
|
||||
"ibv_create_cq", "ibv_destroy_cq", "ibv_create_qp", "ibv_modify_qp", "ibv_destroy_qp",
|
||||
};
|
||||
#endif // (MSCCLPP_USE_MRC)
|
||||
if (!globalIBVerbsHandle) {
|
||||
if (mscclpp::env()->ibvSo != "") {
|
||||
void* handle = ::dlopen(mscclpp::env()->ibvSo.c_str(), RTLD_NOW);
|
||||
@@ -38,7 +55,26 @@ void* IBVerbs::dlsym(const std::string& symbol, bool allowReturnNull) {
|
||||
THROW(NET, SysError, errno, "Failed to open libibverbs: ", std::string(::dlerror()));
|
||||
}
|
||||
}
|
||||
#if (MSCCLPP_USE_MRC)
|
||||
// In MRC mode, `VMRC_LIBIBVERBS_SO` should be set.
|
||||
char* vmrcLibibverbsSo = ::getenv("VMRC_LIBIBVERBS_SO");
|
||||
void* ptr;
|
||||
if (vmrcLibibverbsSo != nullptr && mrcSymbols.find(symbol) == mrcSymbols.end()) {
|
||||
// If we are in MRC mode and the symbol is not in the table, get it from the original libibverbs.
|
||||
if (!globalOrigIBVerbsHandle) {
|
||||
void* handle = ::dlopen(vmrcLibibverbsSo, RTLD_NOW);
|
||||
if (!handle) {
|
||||
THROW(NET, SysError, errno, "Failed to open ", std::string(vmrcLibibverbsSo));
|
||||
}
|
||||
globalOrigIBVerbsHandle.reset(handle);
|
||||
}
|
||||
ptr = ::dlsym(globalOrigIBVerbsHandle.get(), symbol.c_str());
|
||||
} else {
|
||||
ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
|
||||
}
|
||||
#else // !(MSCCLPP_USE_MRC)
|
||||
void* ptr = ::dlsym(globalIBVerbsHandle.get(), symbol.c_str());
|
||||
#endif // !(MSCCLPP_USE_MRC)
|
||||
if (!ptr && !allowReturnNull) {
|
||||
THROW(NET, SysError, errno, "Failed to load libibverbs symbol: ", symbol);
|
||||
}
|
||||
|
||||
@@ -4,18 +4,16 @@
|
||||
#ifndef MSCCLPP_ATOMIC_HPP_
|
||||
#define MSCCLPP_ATOMIC_HPP_
|
||||
|
||||
#if defined(MSCCLPP_USE_CUDA)
|
||||
#ifndef MSCCLPP_DEVICE_CUDA
|
||||
// On CUDA host-side compiles, force atomic_device.hpp's CUDA branch so host code uses
|
||||
// cuda::atomic_ref (for system-scope ordering with GPU readers). On CUDA device compiles
|
||||
// (MSCCLPP_DEVICE_CUDA already set by device.hpp) and on ROCm builds, include normally —
|
||||
// atomic_device.hpp's branch selection works correctly without forcing.
|
||||
#if defined(MSCCLPP_USE_CUDA) && !defined(MSCCLPP_DEVICE_CUDA)
|
||||
#define MSCCLPP_DEVICE_CUDA
|
||||
#include <mscclpp/atomic_device.hpp>
|
||||
#undef MSCCLPP_DEVICE_CUDA
|
||||
#endif // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
#else // !defined(MSCCLPP_USE_CUDA)
|
||||
#ifndef MSCCLPP_DEVICE_HIP
|
||||
#define MSCCLPP_DEVICE_HIP
|
||||
#else
|
||||
#include <mscclpp/atomic_device.hpp>
|
||||
#undef MSCCLPP_DEVICE_HIP
|
||||
#endif // !defined(MSCCLPP_DEVICE_HIP)
|
||||
#endif // !defined(MSCCLPP_USE_CUDA)
|
||||
#endif
|
||||
|
||||
#endif // MSCCLPP_ATOMIC_HPP_
|
||||
@@ -62,7 +62,7 @@ struct Communicator::Impl {
|
||||
std::unordered_map<const BaseConnection*, ConnectionInfo> connectionInfos_;
|
||||
|
||||
// Temporary storage for the latest RecvItem of each {remoteRank, tag} pair.
|
||||
// If the RecvItem gets ready, it will be removed at the next call to getLastRecvItem.
|
||||
// The RecvItem is removed when it finishes or when getLastRecvItem observes that it is ready.
|
||||
std::unordered_map<std::pair<int, int>, std::shared_ptr<BaseRecvItem>, PairHash> lastRecvItems_;
|
||||
|
||||
// RegisteredMemory items sent to the local rank of each tag. Sending memory to the local rank is
|
||||
@@ -79,6 +79,9 @@ struct Communicator::Impl {
|
||||
// If the item is ready, it will be removed from the map and nullptr will be returned.
|
||||
std::shared_ptr<BaseRecvItem> getLastRecvItem(int remoteRank, int tag);
|
||||
|
||||
// Clear the last RecvItem only if it still matches the expected item.
|
||||
void clearLastRecvItemIfMatches(int remoteRank, int tag, const std::shared_ptr<BaseRecvItem>& expectedItem);
|
||||
|
||||
struct Connector;
|
||||
};
|
||||
|
||||
|
||||
@@ -52,6 +52,23 @@ class BaseConnection {
|
||||
/// When false, the NIC writes directly to the semaphore's registered memory (e.g., via atomics).
|
||||
virtual bool isSignalForwarding() const { return false; }
|
||||
|
||||
/// Request a flush. Subclasses that support async flush (e.g. IBConnection) override this
|
||||
/// to be a no-op and rely on progressFlush() to drive completion via the proxy thread.
|
||||
/// The default does a blocking flush(); progressFlush() then trivially returns true.
|
||||
/// @note Only call from the proxy thread.
|
||||
virtual void requestFlush() { flush(); }
|
||||
|
||||
/// Progress pending async flush operations (non-blocking CQ poll).
|
||||
/// @note Only call from the proxy thread.
|
||||
/// @return true if no flush is pending (CQ fully drained or no request).
|
||||
virtual bool progressFlush() { return true; }
|
||||
|
||||
/// Get pointer to the GPU-visible flush-done position (host-pinned memory).
|
||||
/// ProxyService writes "one past the highest completed FIFO position" here when the CQ
|
||||
/// drains; GPU threads spin on it (`waitFlush`) until it surpasses their own push position.
|
||||
/// @note Pointer is valid for the lifetime of this Connection.
|
||||
uint64_t* getFlushDonePtr() const { return gpuFlushDonePos_.get(); }
|
||||
|
||||
virtual Transport transport() const = 0;
|
||||
|
||||
virtual Transport remoteTransport() const = 0;
|
||||
@@ -77,6 +94,11 @@ class BaseConnection {
|
||||
std::shared_ptr<Context> context_;
|
||||
Endpoint localEndpoint_;
|
||||
int maxWriteQueueSize_;
|
||||
|
||||
// GPU-visible flush-done position (host-pinned memory). ProxyService writes one past the
|
||||
// highest FIFO position whose TriggerSync request has fully completed on this connection
|
||||
// (CQ drained for IB, synchronous flush() returned for non-IB).
|
||||
std::shared_ptr<uint64_t> gpuFlushDonePos_;
|
||||
};
|
||||
|
||||
class CudaIpcConnection : public BaseConnection {
|
||||
@@ -153,6 +175,9 @@ class IBConnection : public BaseConnection {
|
||||
void atomicAdd(RegisteredMemory dst, uint64_t dstOffset, int64_t value) override;
|
||||
|
||||
void flush(int64_t timeoutUsec) override;
|
||||
|
||||
void requestFlush() override;
|
||||
bool progressFlush() override;
|
||||
};
|
||||
|
||||
class EthernetConnection : public BaseConnection {
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include <mscclpp/switch_channel_device.hpp>
|
||||
|
||||
#include "execution_common.hpp"
|
||||
#include "logger.hpp"
|
||||
#include "reduce_kernel.hpp"
|
||||
namespace mscclpp {
|
||||
|
||||
@@ -876,7 +877,19 @@ class ExecutionKernel {
|
||||
#endif
|
||||
break;
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
case DataType::FLOAT8_E4M3:
|
||||
case DataType::FLOAT8_E4M3FN:
|
||||
case DataType::FLOAT8_E4M3FNUZ:
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
if (dataType == DataType::FLOAT8_E4M3FN) {
|
||||
THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ");
|
||||
}
|
||||
#else
|
||||
if (dataType == DataType::FLOAT8_E4M3FNUZ) {
|
||||
THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN");
|
||||
}
|
||||
#endif
|
||||
executionKernel<__fp8_e4m3, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
|
||||
rank, (__fp8_e4m3*)src, (__fp8_e4m3*)dst, (__fp8_e4m3*)scratch, scratchOffset, scratchChunkSize, plan,
|
||||
semaphores, localMemoryIdBegin, flag
|
||||
@@ -888,6 +901,18 @@ class ExecutionKernel {
|
||||
#endif
|
||||
break;
|
||||
case DataType::FLOAT8_E5M2:
|
||||
case DataType::FLOAT8_E5M2FNUZ:
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
if (dataType == DataType::FLOAT8_E5M2) {
|
||||
THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ");
|
||||
}
|
||||
#else
|
||||
if (dataType == DataType::FLOAT8_E5M2FNUZ) {
|
||||
THROW(LogSubsys::EXEC, Error, ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2");
|
||||
}
|
||||
#endif
|
||||
executionKernel<__fp8_e5m2, PacketType, ReuseScratch><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
|
||||
rank, (__fp8_e5m2*)src, (__fp8_e5m2*)dst, (__fp8_e5m2*)scratch, scratchOffset, scratchChunkSize, plan,
|
||||
semaphores, localMemoryIdBegin, flag
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
@@ -25,7 +26,7 @@ GdrStatus gdrStatus();
|
||||
bool gdrEnabled();
|
||||
|
||||
/// Return a human-readable error message for the current GDRCopy status.
|
||||
const char* gdrStatusMessage();
|
||||
std::string gdrStatusMessage();
|
||||
|
||||
/// RAII wrapper for a GDRCopy BAR1 mapping of a GPU address.
|
||||
/// When GDRCopy is not available, all operations are no-ops and valid() returns false.
|
||||
|
||||
38
src/core/include/proxy_impl.hpp
Normal file
38
src/core/include/proxy_impl.hpp
Normal file
@@ -0,0 +1,38 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#ifndef MSCCLPP_PROXY_IMPL_HPP_
|
||||
#define MSCCLPP_PROXY_IMPL_HPP_
|
||||
|
||||
#include <atomic>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <mscclpp/fifo.hpp>
|
||||
#include <mscclpp/proxy.hpp>
|
||||
#include <thread>
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
struct Proxy::Impl {
|
||||
ProxyHandler handler;
|
||||
std::function<void()> threadInit;
|
||||
std::function<void()> progressHandler;
|
||||
std::shared_ptr<Fifo> fifo;
|
||||
std::atomic_bool threadStarted;
|
||||
std::thread service;
|
||||
std::atomic_bool running;
|
||||
|
||||
Impl(ProxyHandler handler, std::function<void()> threadInit, int fifoSize)
|
||||
: handler(handler),
|
||||
threadInit(threadInit),
|
||||
fifo(std::make_shared<Fifo>(fifoSize)),
|
||||
threadStarted(false),
|
||||
running(false) {}
|
||||
|
||||
// Must be called before start() — the proxy thread captures progressHandler at start time.
|
||||
void setProgressHandler(std::function<void()> h) { progressHandler = std::move(h); }
|
||||
};
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_PROXY_IMPL_HPP_
|
||||
@@ -1,11 +1,14 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <mscclpp/numa.hpp>
|
||||
#include <mscclpp/port_channel.hpp>
|
||||
|
||||
#include "api.h"
|
||||
#include "debug.h"
|
||||
#include "atomic.hpp"
|
||||
#include "connection.hpp"
|
||||
#include "logger.hpp"
|
||||
#include "proxy_impl.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
@@ -34,11 +37,12 @@ MSCCLPP_API_CPP ProxyService::ProxyService(int fifoSize) {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(cudaDevice));
|
||||
if (deviceNumaNode >= 0) {
|
||||
numaBind(deviceNumaNode);
|
||||
INFO(MSCCLPP_INIT, "NUMA node of ProxyService proxy thread is set to %d", deviceNumaNode);
|
||||
INFO(CONN, "NUMA node of ProxyService proxy thread is set to ", deviceNumaNode);
|
||||
}
|
||||
};
|
||||
auto handlerFunc = [&](ProxyTrigger triggerRaw) { return handleTrigger(triggerRaw); };
|
||||
proxy_ = std::make_shared<Proxy>(handlerFunc, initFunc, fifoSize);
|
||||
proxy_->pimpl_->setProgressHandler([this]() { progressFlushes(); });
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP SemaphoreId ProxyService::buildAndAddSemaphore(Communicator& communicator,
|
||||
@@ -84,9 +88,28 @@ MSCCLPP_API_CPP PortChannel ProxyService::portChannel(SemaphoreId id, MemoryId d
|
||||
|
||||
MSCCLPP_API_CPP void ProxyService::startProxy(bool blocking) { proxy_->start(blocking); }
|
||||
|
||||
MSCCLPP_API_CPP void ProxyService::stopProxy() { proxy_->stop(); }
|
||||
MSCCLPP_API_CPP void ProxyService::stopProxy() {
|
||||
proxy_->stop();
|
||||
// Drain pending flushes. After a bounded loop, force-unblock any still-pending GPU
|
||||
// waiters with a sentinel write (UINT64_MAX > any FIFO position).
|
||||
for (int i = 0; i < 1000 && !pendingFlushPos_.empty(); ++i) {
|
||||
progressFlushes();
|
||||
}
|
||||
if (!pendingFlushPos_.empty()) {
|
||||
WARN(CONN, "stopProxy: ", pendingFlushPos_.size(), " connections still pending; writing sentinel");
|
||||
for (auto& [conn, pos] : pendingFlushPos_) {
|
||||
if (uint64_t* ptr = conn->getFlushDonePtr()) atomicStore(ptr, UINT64_MAX, memoryOrderRelease);
|
||||
}
|
||||
pendingFlushPos_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
|
||||
// The proxy is the sole FIFO consumer and processes in strict push order, so the FIFO's
|
||||
// tail (between poll() and pop()) matches the value GPU's fifo_.push() returned for this
|
||||
// trigger — use it directly as our per-trigger sequence number.
|
||||
uint64_t pos = proxy_->fifo()->tail();
|
||||
|
||||
std::shared_ptr<Host2DeviceSemaphore> semaphore = semaphores_[trigger.fields.semaphoreId];
|
||||
|
||||
auto& conn = semaphore->connection();
|
||||
@@ -114,9 +137,15 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
|
||||
numRequests++;
|
||||
}
|
||||
|
||||
if (((trigger.fields.type & TriggerSync) && numRequests > 0) ||
|
||||
(maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize)) {
|
||||
conn.flush();
|
||||
if (trigger.fields.type & TriggerSync) {
|
||||
// Record this TriggerSync's FIFO position. The GPU caller is spinning on
|
||||
// flushDonePos_ > pos; progressFlushes() will publish pos+1 once the CQ drains.
|
||||
// Later TriggerSyncs on the same conn overwrite — CQ drain completes them all at once.
|
||||
conn.impl_->requestFlush();
|
||||
pendingFlushPos_[conn.impl_] = pos;
|
||||
numRequests = 0;
|
||||
} else if (maxWriteQueueSize != -1 && numRequests >= maxWriteQueueSize) {
|
||||
conn.flush(); // flow-control flush stays blocking
|
||||
numRequests = 0;
|
||||
}
|
||||
|
||||
@@ -124,12 +153,27 @@ ProxyHandlerResult ProxyService::handleTrigger(ProxyTrigger trigger) {
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP BasePortChannel::DeviceHandle BasePortChannel::deviceHandle() const {
|
||||
return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle());
|
||||
auto& conn = semaphore_->connection();
|
||||
return BasePortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(),
|
||||
conn.impl_->getFlushDonePtr());
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP PortChannel::DeviceHandle PortChannel::deviceHandle() const {
|
||||
return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_,
|
||||
src_);
|
||||
auto& conn = semaphore_->connection();
|
||||
return PortChannel::DeviceHandle(semaphoreId_, semaphore_->deviceHandle(), proxy_->fifo()->deviceHandle(), dst_, src_,
|
||||
conn.impl_->getFlushDonePtr());
|
||||
}
|
||||
|
||||
void ProxyService::progressFlushes() {
|
||||
for (auto it = pendingFlushPos_.begin(); it != pendingFlushPos_.end();) {
|
||||
if (it->first->progressFlush()) {
|
||||
// CQ drained: publish pos+1 to unblock GPU waiters whose own pos <= recorded pos.
|
||||
atomicStore(it->first->getFlushDonePtr(), it->second + 1, memoryOrderRelease);
|
||||
it = pendingFlushPos_.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -1,38 +1,21 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <atomic>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/numa.hpp>
|
||||
#include <mscclpp/proxy.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
#include <thread>
|
||||
|
||||
#include "api.h"
|
||||
#include "debug.h"
|
||||
#include "logger.hpp"
|
||||
#include "proxy_impl.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
constexpr int ProxyStopCheckPeriod = 1000;
|
||||
constexpr int ProxyStartWarnPeriod = 1000;
|
||||
|
||||
struct Proxy::Impl {
|
||||
ProxyHandler handler;
|
||||
std::function<void()> threadInit;
|
||||
std::shared_ptr<Fifo> fifo;
|
||||
std::atomic_bool threadStarted;
|
||||
std::thread service;
|
||||
std::atomic_bool running;
|
||||
|
||||
Impl(ProxyHandler handler, std::function<void()> threadInit, int fifoSize)
|
||||
: handler(handler),
|
||||
threadInit(threadInit),
|
||||
fifo(std::make_shared<Fifo>(fifoSize)),
|
||||
threadStarted(false),
|
||||
running(false) {}
|
||||
};
|
||||
|
||||
MSCCLPP_API_CPP Proxy::Proxy(ProxyHandler handler, std::function<void()> threadInit, int fifoSize) {
|
||||
pimpl_ = std::make_unique<Impl>(handler, threadInit, fifoSize);
|
||||
}
|
||||
@@ -70,18 +53,23 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) {
|
||||
|
||||
pimpl_->threadStarted.store(true, std::memory_order_release);
|
||||
|
||||
ProxyHandler handler = this->pimpl_->handler;
|
||||
auto fifo = this->pimpl_->fifo;
|
||||
ProxyHandler handler = pimpl_->handler;
|
||||
auto progressHandler = pimpl_->progressHandler;
|
||||
auto fifo = pimpl_->fifo;
|
||||
ProxyTrigger trigger;
|
||||
|
||||
int runCnt = ProxyStopCheckPeriod;
|
||||
for (;;) {
|
||||
if (runCnt-- == 0) {
|
||||
runCnt = ProxyStopCheckPeriod;
|
||||
if (!this->pimpl_->running.load(std::memory_order_acquire)) {
|
||||
if (!pimpl_->running.load(std::memory_order_acquire)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Per-iteration system work (e.g. progressing pending async flushes),
|
||||
// run regardless of whether a trigger is ready this iteration.
|
||||
if (progressHandler) progressHandler();
|
||||
|
||||
// Poll to see if we are ready to send anything
|
||||
trigger = fifo->poll();
|
||||
if (trigger.fst == 0 || trigger.snd == 0) { // TODO: this check is a potential pitfall for custom triggers
|
||||
@@ -107,7 +95,7 @@ MSCCLPP_API_CPP void Proxy::start(bool blocking) {
|
||||
count--;
|
||||
if (count == 0) {
|
||||
count = ProxyStartWarnPeriod;
|
||||
WARN("Proxy thread startup taking longer than expected.");
|
||||
WARN(CONN, "Proxy thread startup taking longer than expected.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
|
||||
|
||||
std::shared_ptr<uint64_t> TokenPool::getToken() {
|
||||
auto deleter = [self = shared_from_this()](uint64_t* token) {
|
||||
// Zero the slot on release so the next allocator hands out a clean
|
||||
// semaphore counter (matches a freshly-allocated slot).
|
||||
mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
|
||||
size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
|
||||
size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
|
||||
uint64_t mask = 1UL << bit;
|
||||
|
||||
@@ -113,7 +113,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra
|
||||
};
|
||||
static const std::vector<DslAlgoConfig> defaultAlgoConfigs = {
|
||||
{"allreduce_2nodes_1K_64K.json", "allreduce", 8, 16, {{"default", 1}}},
|
||||
{"allreduce_2nodes_64K_2M.json", "allreduce", 8, 16, {{"default", 1}}}};
|
||||
{"allreduce_2nodes_128K_2M.json", "allreduce", 8, 16, {{"default", 1}}},
|
||||
{"allreduce_4nodes_1K_64K.json", "allreduce", 8, 32, {{"default", 1}}},
|
||||
{"allreduce_4nodes_128K_2M.json", "allreduce", 4, 64, {{"default", 1}}}};
|
||||
AlgorithmCollection collection;
|
||||
|
||||
static auto generateFileId = [](const std::string& input) {
|
||||
|
||||
@@ -200,7 +200,8 @@ inline std::pair<int, int> getDefaultBlockNumAndThreadNum(size_t inputSize, int
|
||||
{
|
||||
bool isFp8 = dtype == DataType::FLOAT8_E4M3B15;
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
|
||||
isFp8 = isFp8 || dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
|
||||
dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
|
||||
#endif
|
||||
if (isFp8) {
|
||||
if (inputSize < (64 << 10)) {
|
||||
@@ -310,4 +311,4 @@ std::shared_ptr<Algorithm> AllreducePacket::build() {
|
||||
}
|
||||
|
||||
} // namespace collective
|
||||
} // namespace mscclpp
|
||||
} // namespace mscclpp
|
||||
|
||||
@@ -101,10 +101,20 @@ AllreduceFunc dispatchByDtype(mscclpp::DataType dtype, mscclpp::DataType accumDt
|
||||
return Adapter<Op, __bfloat16, __bfloat16>::call;
|
||||
#endif
|
||||
#if defined(__FP8_TYPES_EXIST__)
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E4M3) {
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E4M3FNUZ) {
|
||||
return dispatchFp8Accum<Op, __fp8_e4m3, Adapter>(accumDtype, dtype);
|
||||
#else
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E4M3FN) {
|
||||
return dispatchFp8Accum<Op, __fp8_e4m3, Adapter>(accumDtype, dtype);
|
||||
#endif
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E5M2FNUZ) {
|
||||
return dispatchFp8Accum<Op, __fp8_e5m2, Adapter>(accumDtype, dtype);
|
||||
#else
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E5M2) {
|
||||
return dispatchFp8Accum<Op, __fp8_e5m2, Adapter>(accumDtype, dtype);
|
||||
#endif
|
||||
#endif
|
||||
} else if (dtype == mscclpp::DataType::FLOAT8_E4M3B15) {
|
||||
return dispatchFp8Accum<Op, __fp8_e4m3b15, Adapter>(accumDtype, dtype);
|
||||
@@ -125,4 +135,4 @@ AllreduceFunc dispatch(ReduceOp op, mscclpp::DataType dtype, mscclpp::DataType a
|
||||
} // namespace collective
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_ALLREDUCE_COMMON_HPP_
|
||||
#endif // MSCCLPP_ALLREDUCE_COMMON_HPP_
|
||||
|
||||
@@ -20,7 +20,8 @@ static bool isNvlsSupportedForDataType(const AlgorithmSelectorConfig& config, Da
|
||||
return false;
|
||||
}
|
||||
|
||||
const bool isFp8 = dtype == DataType::FLOAT8_E4M3 || dtype == DataType::FLOAT8_E5M2;
|
||||
const bool isFp8 = dtype == DataType::FLOAT8_E4M3FN || dtype == DataType::FLOAT8_E4M3FNUZ ||
|
||||
dtype == DataType::FLOAT8_E5M2 || dtype == DataType::FLOAT8_E5M2FNUZ;
|
||||
|
||||
if (!isFp8) {
|
||||
return nvlsSupported;
|
||||
|
||||
@@ -28,12 +28,21 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
|
||||
return mscclpp::DataType::BFLOAT16;
|
||||
#ifdef __FP8_TYPES_EXIST__
|
||||
case ncclFloat8e4m3:
|
||||
return mscclpp::DataType::FLOAT8_E4M3;
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
return mscclpp::DataType::FLOAT8_E4M3FNUZ;
|
||||
#else
|
||||
return mscclpp::DataType::FLOAT8_E4M3FN;
|
||||
#endif
|
||||
case ncclFloat8e5m2:
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
return mscclpp::DataType::FLOAT8_E5M2FNUZ;
|
||||
#else
|
||||
return mscclpp::DataType::FLOAT8_E5M2;
|
||||
#endif
|
||||
#endif
|
||||
default:
|
||||
throw mscclpp::Error("Unsupported ncclDataType_t: " + std::to_string(dtype), mscclpp::ErrorCode::InvalidUsage);
|
||||
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
|
||||
"Unsupported ncclDataType_t: " + std::to_string(dtype));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -41,8 +50,10 @@ inline mscclpp::DataType ncclDataTypeToMscclpp(ncclDataType_t dtype) {
|
||||
inline size_t getDataTypeSize(mscclpp::DataType dtype) {
|
||||
switch (dtype) {
|
||||
case mscclpp::DataType::UINT8:
|
||||
case mscclpp::DataType::FLOAT8_E4M3:
|
||||
case mscclpp::DataType::FLOAT8_E4M3FN:
|
||||
case mscclpp::DataType::FLOAT8_E4M3FNUZ:
|
||||
case mscclpp::DataType::FLOAT8_E5M2:
|
||||
case mscclpp::DataType::FLOAT8_E5M2FNUZ:
|
||||
case mscclpp::DataType::FLOAT8_E4M3B15:
|
||||
return 1;
|
||||
case mscclpp::DataType::FLOAT16:
|
||||
@@ -72,10 +83,32 @@ static inline ncclDataType_t mscclppToNcclDataType(mscclpp::DataType dtype) {
|
||||
case mscclpp::DataType::BFLOAT16:
|
||||
return ncclBfloat16;
|
||||
#ifdef __FP8_TYPES_EXIST__
|
||||
case mscclpp::DataType::FLOAT8_E4M3:
|
||||
#if defined(__FP8_E4M3_IS_FNUZ__)
|
||||
case mscclpp::DataType::FLOAT8_E4M3FNUZ:
|
||||
return ncclFloat8e4m3;
|
||||
case mscclpp::DataType::FLOAT8_E4M3FN:
|
||||
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FN is not natively supported on this platform; use FLOAT8_E4M3FNUZ for NCCL collectives");
|
||||
#else
|
||||
case mscclpp::DataType::FLOAT8_E4M3FN:
|
||||
return ncclFloat8e4m3;
|
||||
case mscclpp::DataType::FLOAT8_E4M3FNUZ:
|
||||
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E4M3FNUZ is not natively supported on this platform; use FLOAT8_E4M3FN for NCCL collectives");
|
||||
#endif
|
||||
#if defined(__FP8_E5M2_IS_FNUZ__)
|
||||
case mscclpp::DataType::FLOAT8_E5M2FNUZ:
|
||||
return ncclFloat8e5m2;
|
||||
case mscclpp::DataType::FLOAT8_E5M2:
|
||||
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2 is not natively supported on this platform; use FLOAT8_E5M2FNUZ for NCCL collectives");
|
||||
#else
|
||||
case mscclpp::DataType::FLOAT8_E5M2:
|
||||
return ncclFloat8e5m2;
|
||||
case mscclpp::DataType::FLOAT8_E5M2FNUZ:
|
||||
THROW(mscclpp::LogSubsys::NCCL, mscclpp::Error, mscclpp::ErrorCode::InvalidUsage,
|
||||
"FLOAT8_E5M2FNUZ is not natively supported on this platform; use FLOAT8_E5M2 for NCCL collectives");
|
||||
#endif
|
||||
#endif
|
||||
case mscclpp::DataType::FLOAT8_E4M3B15:
|
||||
// float8_e4m3b15 has no NCCL equivalent; NCCL cannot reduce this type correctly.
|
||||
@@ -98,4 +131,4 @@ inline mscclpp::ReduceOp ncclRedOpToMscclpp(ncclRedOp_t op) {
|
||||
}
|
||||
}
|
||||
|
||||
#endif // MSCCLPP_DATATYPE_CONVERSION_HPP_
|
||||
#endif // MSCCLPP_DATATYPE_CONVERSION_HPP_
|
||||
|
||||
@@ -161,6 +161,14 @@ class PortChannelOneToOneTest : public CommunicatorTestBase {
|
||||
void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default);
|
||||
void testAtomicAdd(bool useIPC, bool useIb, bool useEthernet, IbMode ibMode = IbMode::Default);
|
||||
void testBandwidth(PingPongTestParams params);
|
||||
void setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase,
|
||||
std::vector<std::shared_ptr<int>>& sendBuffs,
|
||||
std::vector<mscclpp::RegisteredMemory>& localMems,
|
||||
std::vector<mscclpp::RegisteredMemory>& remoteMems,
|
||||
std::vector<mscclpp::PortChannel>& portChannels);
|
||||
void testMultiQpBandwidth(IbMode ibMode, int numQps);
|
||||
void testMultiQpFlushStress(IbMode ibMode, int numQps);
|
||||
void testSameChanConcurrentFlush(IbMode ibMode);
|
||||
|
||||
std::shared_ptr<mscclpp::ProxyService> proxyService;
|
||||
};
|
||||
|
||||
@@ -36,6 +36,18 @@ inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) {
|
||||
#define REQUIRE_GDR_FOR_IB_MODE(mode) // No extra requirements on non-CUDA platforms.
|
||||
#endif
|
||||
|
||||
// Skip an IPC-only PortChannel test (useIPC=true, useIB=false, useEthernet=false) when CudaIpc
|
||||
// cannot connect this rank pair. CudaIpc works intra-node always, and cross-node only on MNNVL
|
||||
// systems (GB200 NVL72 + IMEX). The combined check is "at least 2 ranks per node" OR "fabric
|
||||
// (MNNVL) handles are usable on this system".
|
||||
#define REQUIRE_CUDA_IPC_AVAILABLE \
|
||||
do { \
|
||||
if (gEnv->nRanksPerNode < 2 && !mscclpp::isFabricMemHandleAvailable()) { \
|
||||
SKIP_TEST() << "CudaIpc requires intra-node ranks (nRanksPerNode>=2) or MNNVL fabric handles, \
|
||||
both unavailable here."; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void PortChannelOneToOneTest::SetUp() {
|
||||
// Use only two ranks
|
||||
setNumRanksToUse(2);
|
||||
@@ -71,7 +83,10 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vector<mscclpp::PortChan
|
||||
continue;
|
||||
}
|
||||
mscclpp::EndpointConfig cfg;
|
||||
if ((rankToNode(r) == rankToNode(gEnv->rank)) && useIPC) {
|
||||
if (useIPC) {
|
||||
// CudaIpc works intra-node always, and cross-node on MNNVL systems (GB200 NVL72 + IMEX)
|
||||
// via fabric handles. Tests that exercise CudaIpc across nodes on non-MNNVL hardware should
|
||||
// gate themselves with REQUIRE_CUDA_IPC_AVAILABLE; we always request CudaIpc here when asked.
|
||||
cfg.transport = mscclpp::Transport::CudaIpc;
|
||||
} else if (useIb) {
|
||||
cfg.transport = ibTransport;
|
||||
@@ -262,6 +277,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PingPong) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPong(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -279,6 +295,7 @@ TEST(PortChannelOneToOneTest, PingPongEthernet) {
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PingPongWithPoll) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPong(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -291,6 +308,7 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPongPerf(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -482,7 +500,10 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
|
||||
proxyService->stopProxy();
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
|
||||
TEST(PortChannelOneToOneTest, PacketPingPong) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPacketPingPong(false, IbMode::Default);
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
@@ -490,7 +511,10 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
|
||||
testPacketPingPong(true, IbMode::Host);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPacketPingPongPerf(false, IbMode::Default);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
@@ -583,6 +607,7 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testBandwidth(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -639,6 +664,28 @@ __global__ void kernelPortChannelAtomicAddConcurrent(int64_t* localBuff, int nTr
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr int kMaxQps = 4;
|
||||
__constant__ DeviceHandle<mscclpp::PortChannel> gMultiQpPortChans[kMaxQps];
|
||||
|
||||
// Multi-QP bandwidth kernel: barrier on QP 0 only, then putWithSignal on all QPs.
|
||||
// Only one signal/wait pair is needed for sync between two GPU kernels.
|
||||
__global__ void kernelMultiQpBandwidth(int nElemPerChan, int nIters, int numQps) {
|
||||
if (threadIdx.x != 0) return;
|
||||
for (int i = 0; i < nIters; i++) {
|
||||
// Barrier on QP 0 only — syncs both ranks
|
||||
gMultiQpPortChans[0].signal();
|
||||
gMultiQpPortChans[0].wait();
|
||||
// Data transfer: put on all QPs simultaneously
|
||||
for (int q = 0; q < numQps; q++) {
|
||||
gMultiQpPortChans[q].putWithSignal(0, nElemPerChan * sizeof(int));
|
||||
}
|
||||
// Wait for all remote data arrivals
|
||||
for (int q = 0; q < numQps; q++) {
|
||||
gMultiQpPortChans[q].wait();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void PortChannelOneToOneTest::testAtomicAdd(bool useIPC, bool useIb, bool useEthernet, IbMode ibMode) {
|
||||
if (gEnv->rank >= numRanksToUse) return;
|
||||
|
||||
@@ -725,3 +772,270 @@ TEST(PortChannelOneToOneTest, AtomicAddIbHostNoAtomicRejected) {
|
||||
|
||||
communicator->bootstrap()->barrier();
|
||||
}
|
||||
|
||||
// Multi-QP setup helper: bootstrap N parallel IB connections + port channels in two
|
||||
// futures-based phases (issue all async ops before resolving any, to avoid deadlock).
|
||||
// tagBase: distinct base used by each caller so concurrent tests don't clash on tags.
|
||||
void PortChannelOneToOneTest::setupMultiQpChannels(int numQps, size_t elemsPerChan, IbMode ibMode, int tagBase,
|
||||
std::vector<std::shared_ptr<int>>& sendBuffs,
|
||||
std::vector<mscclpp::RegisteredMemory>& localMems,
|
||||
std::vector<mscclpp::RegisteredMemory>& remoteMems,
|
||||
std::vector<mscclpp::PortChannel>& portChannels) {
|
||||
const int peer = 1 - communicator->bootstrap()->getRank();
|
||||
sendBuffs.assign(numQps, nullptr);
|
||||
localMems.assign(numQps, mscclpp::RegisteredMemory{});
|
||||
remoteMems.assign(numQps, mscclpp::RegisteredMemory{});
|
||||
portChannels.clear();
|
||||
|
||||
std::vector<std::shared_future<mscclpp::Connection>> connFutures(numQps);
|
||||
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemFutures(numQps);
|
||||
|
||||
for (int q = 0; q < numQps; q++) {
|
||||
sendBuffs[q] = mscclpp::GpuBuffer<int>(elemsPerChan).memory();
|
||||
localMems[q] = communicator->registerMemory(sendBuffs[q].get(), elemsPerChan * sizeof(int), ibTransport);
|
||||
|
||||
mscclpp::EndpointConfig cfg;
|
||||
cfg.transport = ibTransport;
|
||||
cfg.ib.gidIndex = std::stoi(gEnv->args["ib_gid_index"]);
|
||||
cfg.ib.mode = ibMode;
|
||||
|
||||
connFutures[q] = communicator->connect(cfg, peer, tagBase + q);
|
||||
communicator->sendMemory(localMems[q], peer, tagBase + numQps + q);
|
||||
remoteMemFutures[q] = communicator->recvMemory(peer, tagBase + numQps + q);
|
||||
}
|
||||
|
||||
for (int q = 0; q < numQps; q++) {
|
||||
auto conn = connFutures[q].get();
|
||||
remoteMems[q] = remoteMemFutures[q].get();
|
||||
auto sema = communicator->buildSemaphore(conn, peer, tagBase + 2 * numQps + q).get();
|
||||
mscclpp::SemaphoreId cid = proxyService->addSemaphore(sema);
|
||||
portChannels.emplace_back(
|
||||
proxyService->portChannel(cid, proxyService->addMemory(remoteMems[q]), proxyService->addMemory(localMems[q])));
|
||||
}
|
||||
}
|
||||
|
||||
void PortChannelOneToOneTest::testMultiQpBandwidth(IbMode ibMode, int numQps) {
|
||||
if (gEnv->rank >= numRanksToUse) return;
|
||||
|
||||
const int rank = communicator->bootstrap()->getRank();
|
||||
const int maxElemPerChan = 32 * 1024 * 1024; // 128 MB per channel
|
||||
|
||||
std::vector<std::shared_ptr<int>> sendBuffs;
|
||||
std::vector<mscclpp::RegisteredMemory> localMems;
|
||||
std::vector<mscclpp::RegisteredMemory> remoteMems;
|
||||
std::vector<mscclpp::PortChannel> portChannels;
|
||||
setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/100, sendBuffs, localMems, remoteMems, portChannels);
|
||||
|
||||
std::vector<DeviceHandle<mscclpp::PortChannel>> handles;
|
||||
for (auto& ch : portChannels) handles.push_back(ch.deviceHandle());
|
||||
ASSERT_EQ(handles.size(), static_cast<size_t>(numQps));
|
||||
ASSERT_LE(numQps, kMaxQps); // numQps must not exceed __constant__ array size (kMaxQps)
|
||||
MSCCLPP_CUDATHROW(
|
||||
cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle<mscclpp::PortChannel>)));
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
const std::string testName = ::mscclpp::test::currentTestName();
|
||||
const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : "");
|
||||
|
||||
for (int nElemPerChan :
|
||||
{256, 16 * 1024, 256 * 1024, 1024 * 1024, 4 * 1024 * 1024, 16 * 1024 * 1024, 32 * 1024 * 1024}) {
|
||||
int nIters = 10000;
|
||||
// Warm-up
|
||||
kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, 10, numQps);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
// Measure
|
||||
mscclpp::Timer timer;
|
||||
kernelMultiQpBandwidth<<<1, 1>>>(nElemPerChan, nIters, numQps);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
double elapsedUs = timer.elapsed();
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
if (rank == 0) {
|
||||
double totalBytes = (double)nElemPerChan * sizeof(int) * numQps;
|
||||
double elapsedMsPerIter = elapsedUs / 1e3 / nIters;
|
||||
double gbps = totalBytes / elapsedMsPerIter * 1e-6;
|
||||
double totalSizeKB = totalBytes / 1024.0;
|
||||
std::string label;
|
||||
if (totalSizeKB >= 1024.0)
|
||||
label = std::to_string((int)(totalSizeKB / 1024.0)) + " MB";
|
||||
else
|
||||
label = std::to_string((int)totalSizeKB) + " KB";
|
||||
::mscclpp::test::reportPerfResult(label + " (" + qpLabel + ")", gbps, "GB/s");
|
||||
}
|
||||
}
|
||||
|
||||
proxyService->stopProxy();
|
||||
|
||||
for (auto& m : localMems) registeredMemories.push_back(m);
|
||||
for (auto& m : remoteMems) registeredMemories.push_back(m);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
|
||||
for (int numQps : {1, 2, 4}) {
|
||||
testMultiQpBandwidth(IbMode::Host, numQps);
|
||||
}
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, MultiQpBandwidthIbHostNoAtomicMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
|
||||
for (int numQps : {1, 2, 4}) {
|
||||
testMultiQpBandwidth(IbMode::HostNoAtomic, numQps);
|
||||
}
|
||||
}
|
||||
|
||||
// Multi-QP flush-stress kernel: one thread per QP, all calling putWithSignalAndFlush
|
||||
// concurrently so all N CQ drains are in flight on the proxy thread at once.
|
||||
// This is the concurrent-flush worst case the async-progress design protects against.
|
||||
__global__ void kernelMultiQpFlushStress(int nElemPerChan, int nIters, int numQps) {
|
||||
int q = threadIdx.x;
|
||||
if (q >= numQps) return;
|
||||
for (int i = 0; i < nIters; i++) {
|
||||
if (q == 0) {
|
||||
gMultiQpPortChans[0].signal();
|
||||
gMultiQpPortChans[0].wait();
|
||||
}
|
||||
__syncthreads();
|
||||
gMultiQpPortChans[q].putWithSignalAndFlush(0, nElemPerChan * sizeof(int));
|
||||
gMultiQpPortChans[q].wait();
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
void PortChannelOneToOneTest::testMultiQpFlushStress(IbMode ibMode, int numQps) {
|
||||
if (gEnv->rank >= numRanksToUse) return;
|
||||
|
||||
const int rank = communicator->bootstrap()->getRank();
|
||||
const int maxElemPerChan = 64 * 1024;
|
||||
|
||||
std::vector<std::shared_ptr<int>> sendBuffs;
|
||||
std::vector<mscclpp::RegisteredMemory> localMems;
|
||||
std::vector<mscclpp::RegisteredMemory> remoteMems;
|
||||
std::vector<mscclpp::PortChannel> portChannels;
|
||||
setupMultiQpChannels(numQps, maxElemPerChan, ibMode, /*tagBase=*/400, sendBuffs, localMems, remoteMems, portChannels);
|
||||
|
||||
std::vector<DeviceHandle<mscclpp::PortChannel>> handles;
|
||||
for (auto& ch : portChannels) handles.push_back(ch.deviceHandle());
|
||||
ASSERT_EQ(handles.size(), static_cast<size_t>(numQps));
|
||||
ASSERT_LE(numQps, kMaxQps);
|
||||
MSCCLPP_CUDATHROW(
|
||||
cudaMemcpyToSymbol(gMultiQpPortChans, handles.data(), numQps * sizeof(DeviceHandle<mscclpp::PortChannel>)));
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
const std::string qpLabel = std::to_string(numQps) + " QP" + (numQps > 1 ? "s" : "");
|
||||
|
||||
for (int nElemPerChan : {256, 4 * 1024, 64 * 1024}) {
|
||||
int nIters = 2000;
|
||||
kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, 10, numQps);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
mscclpp::Timer timer;
|
||||
kernelMultiQpFlushStress<<<1, numQps>>>(nElemPerChan, nIters, numQps);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
double elapsedUs = timer.elapsed();
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
if (rank == 0) {
|
||||
double usPerIter = elapsedUs / nIters;
|
||||
double usPerIterPerQp = usPerIter / numQps;
|
||||
int bytesPerChan = nElemPerChan * (int)sizeof(int);
|
||||
std::string sizeLabel = (bytesPerChan >= 1024) ? (std::to_string(bytesPerChan / 1024) + " KB")
|
||||
: (std::to_string(bytesPerChan) + " B");
|
||||
::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter", usPerIter, "us");
|
||||
::mscclpp::test::reportPerfResult(sizeLabel + " (" + qpLabel + ") per-iter/QP", usPerIterPerQp, "us");
|
||||
}
|
||||
}
|
||||
|
||||
proxyService->stopProxy();
|
||||
|
||||
for (auto& m : localMems) registeredMemories.push_back(m);
|
||||
for (auto& m : remoteMems) registeredMemories.push_back(m);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
|
||||
for (int numQps : {1, 2, 4}) {
|
||||
testMultiQpFlushStress(IbMode::Host, numQps);
|
||||
}
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, MultiQpFlushStressIbHostNoAtomicMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
REQUIRE_GDR_FOR_IB_MODE(IbMode::HostNoAtomic);
|
||||
for (int numQps : {1, 2, 4}) {
|
||||
testMultiQpFlushStress(IbMode::HostNoAtomic, numQps);
|
||||
}
|
||||
}
|
||||
|
||||
// Same-channel concurrent-flush kernel: N GPU threads on the same PortChannel each call
|
||||
// putWithSignalAndFlush in lockstep. Stresses the FIFO-position-based wait target so that
|
||||
// each caller waits on its own TriggerSync rather than on a globally-incrementing counter
|
||||
// that could be assigned out-of-order relative to the FIFO push order.
|
||||
__constant__ DeviceHandle<mscclpp::PortChannel> gSingleChanForConcurrentFlush;
|
||||
|
||||
__global__ void kernelSameChanConcurrentFlush(int nIters) {
|
||||
auto& chan = gSingleChanForConcurrentFlush;
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < nIters; i++) {
|
||||
// Each thread writes to a distinct slot (so puts don't overlap on remote side),
|
||||
// then concurrently flushes on the same channel.
|
||||
uint64_t offset = tid * sizeof(int);
|
||||
chan.putWithSignalAndFlush(offset, offset, sizeof(int));
|
||||
// Each thread waits for one signal from the remote rank's symmetric putWithSignalAndFlush.
|
||||
chan.wait();
|
||||
}
|
||||
}
|
||||
|
||||
void PortChannelOneToOneTest::testSameChanConcurrentFlush(IbMode ibMode) {
|
||||
if (gEnv->rank >= numRanksToUse) return;
|
||||
|
||||
constexpr int nThreads = 4;
|
||||
std::vector<std::shared_ptr<int>> sendBuffs;
|
||||
std::vector<mscclpp::RegisteredMemory> localMems;
|
||||
std::vector<mscclpp::RegisteredMemory> remoteMems;
|
||||
std::vector<mscclpp::PortChannel> portChannels;
|
||||
setupMultiQpChannels(/*numQps=*/1, /*elemsPerChan=*/nThreads, ibMode, /*tagBase=*/700, sendBuffs, localMems,
|
||||
remoteMems, portChannels);
|
||||
|
||||
DeviceHandle<mscclpp::PortChannel> handle = portChannels[0].deviceHandle();
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gSingleChanForConcurrentFlush, &handle, sizeof(handle)));
|
||||
|
||||
proxyService->startProxy();
|
||||
|
||||
// Warm-up
|
||||
kernelSameChanConcurrentFlush<<<1, nThreads>>>(10);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
// Measure: a successful completion (no deadlock, no CQ error) validates that each
|
||||
// concurrent-flush caller waited on its own TriggerSync (not someone else's earlier one).
|
||||
const int nIters = 500;
|
||||
mscclpp::Timer timer;
|
||||
kernelSameChanConcurrentFlush<<<1, nThreads>>>(nIters);
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
double elapsedUs = timer.elapsed();
|
||||
communicator->bootstrap()->barrier();
|
||||
|
||||
if (communicator->bootstrap()->getRank() == 0) {
|
||||
double usPerIter = elapsedUs / nIters;
|
||||
::mscclpp::test::reportPerfResult(std::to_string(nThreads) + " threads same-chan per-iter", usPerIter, "us");
|
||||
}
|
||||
|
||||
proxyService->stopProxy();
|
||||
for (auto& m : localMems) registeredMemories.push_back(m);
|
||||
for (auto& m : remoteMems) registeredMemories.push_back(m);
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, SameChanConcurrentFlushIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
REQUIRE_GDR_FOR_IB_MODE(IbMode::Host);
|
||||
testSameChanConcurrentFlush(IbMode::Host);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user