mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-24 23:06:17 +00:00
Merge branch 'main' into binyli/mnnvl
This commit is contained in:
@@ -46,6 +46,15 @@ std::string getIBDeviceName(Transport ibTransport);
|
||||
/// @return The InfiniBand transport associated with the specified device name.
|
||||
Transport getIBTransportByDeviceName(const std::string& ibDeviceName);
|
||||
|
||||
/// Check whether this process can allocate/import CUDA memory with NVIDIA fabric handles
|
||||
/// (`CU_MEM_HANDLE_TYPE_FABRIC`). Fabric handles enable cross-node `Transport::CudaIpc` on
|
||||
/// MNNVL systems (e.g., GB200 NVL72) when the IMEX service is running. Returns `false` on
|
||||
/// hardware/software stacks without MNNVL+IMEX, in which case `Transport::CudaIpc` is
|
||||
/// restricted to ranks within the same node.
|
||||
///
|
||||
/// @return `true` if fabric handles are usable from this process, `false` otherwise.
|
||||
bool isFabricMemHandleAvailable();
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCLPP_UTILS_HPP_
|
||||
|
||||
@@ -24,6 +24,8 @@ def parse_dtype(dtype_str):
|
||||
dtype_str = dtype_str.strip().lower()
|
||||
if dtype_str == "float16":
|
||||
return cp.float16
|
||||
elif dtype_str in ("bfloat16", "bf16"):
|
||||
return cp.float16 # same 2-byte size; mscclpp DataType is resolved from dtype_str
|
||||
elif dtype_str == "float32":
|
||||
return cp.float32
|
||||
elif dtype_str == "int32":
|
||||
@@ -119,15 +121,18 @@ def parse_size(size_str):
|
||||
return int(size_str)
|
||||
|
||||
|
||||
def dtype_to_mscclpp_dtype(dtype):
|
||||
if dtype == cp.float16:
|
||||
def dtype_to_mscclpp_dtype(dtype_str):
|
||||
dtype_str = dtype_str.strip().lower()
|
||||
if dtype_str == "float16":
|
||||
return DataType.float16
|
||||
elif dtype == cp.float32:
|
||||
elif dtype_str in ("bfloat16", "bf16"):
|
||||
return DataType.bfloat16
|
||||
elif dtype_str == "float32":
|
||||
return DataType.float32
|
||||
elif dtype == cp.int32:
|
||||
elif dtype_str == "int32":
|
||||
return DataType.int32
|
||||
else:
|
||||
raise ValueError(f"Unknown data type: {dtype}")
|
||||
raise ValueError(f"Unknown data type: {dtype_str}")
|
||||
|
||||
|
||||
def build_bufs(
|
||||
@@ -205,7 +210,7 @@ def main(
|
||||
result_buf.data.ptr,
|
||||
input_buf.nbytes,
|
||||
result_buf.nbytes,
|
||||
dtype_to_mscclpp_dtype(dtype),
|
||||
dtype_to_mscclpp_dtype(dtype_str),
|
||||
execution_plan,
|
||||
stream.ptr,
|
||||
packet_type,
|
||||
@@ -231,7 +236,7 @@ def main(
|
||||
npkit.shutdown()
|
||||
print(
|
||||
f"Rank: {mscclpp_group.my_rank} Execution time: {execution_time} us, "
|
||||
f"data size: {result_buf.nbytes} bytes data type: {dtype().dtype.name} "
|
||||
f"data size: {result_buf.nbytes} bytes data type: {dtype_str} "
|
||||
f"packet type: {packet_type}"
|
||||
)
|
||||
executor = None
|
||||
@@ -243,7 +248,7 @@ if __name__ == "__main__":
|
||||
parser.add_argument("-path", "--execution_plan_path", type=str, required=True)
|
||||
parser.add_argument("--size", type=str, required=True)
|
||||
parser.add_argument("--in_place", action="store_true", help="flag to define an in-place operation")
|
||||
parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, float32, int32")
|
||||
parser.add_argument("--dtype", type=str, default="float16", help="Choose from float16, bfloat16, float32, int32")
|
||||
parser.add_argument("--packet_type", type=str, default="LL16", help="Choose from LL8, LL16")
|
||||
parser.add_argument("--n_iters", type=int, default=10)
|
||||
parser.add_argument("--n_graph_iters", type=int, default=10)
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#else
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
@@ -30,6 +32,7 @@ static __device__ unsigned int ranqd1(unsigned int seed) {
|
||||
} \
|
||||
}
|
||||
|
||||
FILL_DATA(bfloat16, __nv_bfloat16)
|
||||
FILL_DATA(float16, __half)
|
||||
FILL_DATA(float32, float)
|
||||
FILL_DATA(int32, int)
|
||||
@@ -48,11 +51,12 @@ FILL_DATA(int32, int)
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_GATHER(bfloat16, __nv_bfloat16)
|
||||
TEST_DATA_ALL_GATHER(float16, __half)
|
||||
TEST_DATA_ALL_GATHER(float32, float)
|
||||
TEST_DATA_ALL_GATHER(int32, int)
|
||||
|
||||
#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType) \
|
||||
#define TEST_DATA_ALL_REDUCE(FuncNameType, DataType, Eps) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_reduce_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
for (int rank = 0; rank < num_ranks; rank++) { \
|
||||
@@ -66,15 +70,19 @@ TEST_DATA_ALL_GATHER(int32, int)
|
||||
} \
|
||||
} \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
assert(abs(float(result_buf[i]) - float(test_buf[i])) < 1e-3 * num_ranks); \
|
||||
float expected = float(test_buf[i]); \
|
||||
float result = float(result_buf[i]); \
|
||||
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
|
||||
assert(abs(result - expected) <= tol); \
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_REDUCE(float16, __half)
|
||||
TEST_DATA_ALL_REDUCE(float32, float)
|
||||
TEST_DATA_ALL_REDUCE(int32, int)
|
||||
TEST_DATA_ALL_REDUCE(bfloat16, __nv_bfloat16, 7.8125e-3f)
|
||||
TEST_DATA_ALL_REDUCE(float16, __half, 9.765625e-4f)
|
||||
TEST_DATA_ALL_REDUCE(float32, float, 1.1920929e-7f)
|
||||
TEST_DATA_ALL_REDUCE(int32, int, 0.0f)
|
||||
|
||||
#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType) \
|
||||
#define TEST_DATA_REDUCE_SCATTER(FuncNameType, DataType, Eps) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_reduce_scatter_##FuncNameType( \
|
||||
DataType* result_buf, DataType* test_buf, size_t num_elems, int num_ranks, int my_rank, int seq) { \
|
||||
int nem_elems_per_rank = num_elems / num_ranks; \
|
||||
@@ -91,14 +99,18 @@ TEST_DATA_ALL_REDUCE(int32, int)
|
||||
} \
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_elems; i += blockDim.x * gridDim.x) { \
|
||||
if (i >= offset && i < offset + nem_elems_per_rank) { \
|
||||
assert(abs(float(result_buf[i - offset]) - float(test_buf[i])) < 1e-3 * num_ranks); \
|
||||
float expected = float(test_buf[i]); \
|
||||
float result = float(result_buf[i - offset]); \
|
||||
float tol = Eps * num_ranks * (1.0f + abs(expected)); \
|
||||
assert(abs(result - expected) <= tol); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_REDUCE_SCATTER(float16, __half)
|
||||
TEST_DATA_REDUCE_SCATTER(float32, float)
|
||||
TEST_DATA_REDUCE_SCATTER(int32, int)
|
||||
TEST_DATA_REDUCE_SCATTER(bfloat16, __nv_bfloat16, 7.8125e-3f)
|
||||
TEST_DATA_REDUCE_SCATTER(float16, __half, 9.765625e-4f)
|
||||
TEST_DATA_REDUCE_SCATTER(float32, float, 1.1920929e-7f)
|
||||
TEST_DATA_REDUCE_SCATTER(int32, int, 0.0f)
|
||||
|
||||
#define TEST_DATA_ALL_TO_ALL(FuncNameType, DataType) \
|
||||
extern "C" __global__ void __launch_bounds__(1024, 1) test_data_all_to_all_##FuncNameType( \
|
||||
@@ -118,6 +130,7 @@ TEST_DATA_REDUCE_SCATTER(int32, int)
|
||||
} \
|
||||
}
|
||||
|
||||
TEST_DATA_ALL_TO_ALL(bfloat16, __nv_bfloat16)
|
||||
TEST_DATA_ALL_TO_ALL(float16, __half)
|
||||
TEST_DATA_ALL_TO_ALL(float32, float)
|
||||
TEST_DATA_ALL_TO_ALL(int32, int)
|
||||
@@ -82,6 +82,17 @@ MSCCLPP_API_CPP int Connection::getMaxWriteQueueSize() const { return impl_->get
|
||||
CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const Endpoint& localEndpoint,
|
||||
const Endpoint& remoteEndpoint)
|
||||
: BaseConnection(context, localEndpoint) {
|
||||
// Log fabric/MNNVL availability exactly once per process so any later cross-node CudaIpc failure
|
||||
// is easy to triage. C++11 magic statics make this thread-safe without an explicit mutex.
|
||||
// NOTE: assigning the message to a std::string first avoids the logger's pointer-formatting
|
||||
// overload from kicking in on the const char* result of the ternary.
|
||||
[[maybe_unused]] static const bool fabricAvailable_ = []() {
|
||||
const bool avail = isFabricMemHandleAvailable();
|
||||
const std::string status = avail ? "available (cross-node CudaIpc via MNNVL/IMEX is supported)"
|
||||
: "NOT available (CudaIpc is restricted to intra-node ranks on this system)";
|
||||
INFO(CONN, "CudaIpc transport selected: fabric handles ", status);
|
||||
return avail;
|
||||
}();
|
||||
if (localEndpoint.transport() != Transport::CudaIpc || remoteEndpoint.transport() != Transport::CudaIpc) {
|
||||
THROW(CONN, Error, ErrorCode::InternalError, "CudaIpc transport is required for CudaIpcConnection");
|
||||
}
|
||||
|
||||
@@ -4,12 +4,11 @@
|
||||
#include "context.hpp"
|
||||
|
||||
#include <mscclpp/env.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#include "api.h"
|
||||
#include "connection.hpp"
|
||||
#include "debug.h"
|
||||
#include "endpoint.hpp"
|
||||
#include "logger.hpp"
|
||||
#include "registered_memory.hpp"
|
||||
|
||||
namespace mscclpp {
|
||||
@@ -78,19 +77,17 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) {
|
||||
|
||||
MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) {
|
||||
if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) {
|
||||
throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for local endpoint");
|
||||
}
|
||||
if (remoteEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().id < 0) {
|
||||
throw Error("No GPU device ID provided for remote endpoint", ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "No GPU device ID provided for remote endpoint");
|
||||
}
|
||||
auto localTransport = localEndpoint.transport();
|
||||
auto remoteTransport = remoteEndpoint.transport();
|
||||
if (localTransport != remoteTransport &&
|
||||
!(AllIBTransports.has(localTransport) && AllIBTransports.has(remoteTransport))) {
|
||||
std::stringstream ss;
|
||||
ss << "Transport mismatch between local (" << localTransport << ") and remote (" << remoteEndpoint.transport()
|
||||
<< ") endpoints";
|
||||
throw Error(ss.str(), ErrorCode::InvalidUsage);
|
||||
THROW(CONN, Error, ErrorCode::InvalidUsage, "Transport mismatch between local (", localTransport, ") and remote (",
|
||||
remoteTransport, ") endpoints");
|
||||
}
|
||||
std::shared_ptr<BaseConnection> conn;
|
||||
if (localTransport == Transport::CudaIpc) {
|
||||
@@ -100,7 +97,9 @@ MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const
|
||||
} else if (localTransport == Transport::Ethernet) {
|
||||
conn = std::make_shared<EthernetConnection>(shared_from_this(), localEndpoint, remoteEndpoint);
|
||||
} else {
|
||||
throw Error("Unsupported transport", ErrorCode::InternalError);
|
||||
THROW(CONN, Error, ErrorCode::InternalError, "Unsupported transport: ", localTransport,
|
||||
" (this usually means EndpointConfig.transport was left at Transport::Unknown — "
|
||||
"set it explicitly to CudaIpc, an IB transport, or Ethernet)");
|
||||
}
|
||||
return Connection(conn);
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <cstring>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/utils.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
#include "unix_socket.hpp"
|
||||
@@ -35,7 +36,7 @@ std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typ
|
||||
return os;
|
||||
}
|
||||
|
||||
[[maybe_unused]] static bool isFabricMemHandleAvailable() {
|
||||
bool isFabricMemHandleAvailable() {
|
||||
#if (CUDA_NVLS_API_AVAILABLE)
|
||||
static int resultCache = -1; // -1: uninitialized, 0: not available, 1: available
|
||||
if (resultCache != -1) {
|
||||
@@ -283,11 +284,19 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
|
||||
THROW(GPU, Error, ErrorCode::InvalidUsage, "GpuIpcMemHandle type is None, cannot create GpuIpcMem");
|
||||
}
|
||||
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::Fabric)) {
|
||||
if (cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
|
||||
CUDA_SUCCESS) {
|
||||
CUresult res =
|
||||
cuMemImportFromShareableHandle(&allocHandle_, (void*)handle_.fabric.handle, CU_MEM_HANDLE_TYPE_FABRIC);
|
||||
if (res == CUDA_SUCCESS) {
|
||||
// Ignore allocHandle in the handle struct since it is process-local and not transferable across processes.
|
||||
handle_.fabric.allocHandle = {};
|
||||
type_ = GpuIpcMemHandle::Type::Fabric;
|
||||
} else {
|
||||
const char* errStr = nullptr;
|
||||
(void)cuGetErrorString(res, &errStr);
|
||||
const std::string errMsg = errStr ? std::string(errStr) : std::string("unknown CUDA error");
|
||||
WARN(GPU, "Fabric IPC handle import failed (", errMsg,
|
||||
"); cross-node CudaIpc requires NVIDIA MNNVL hardware and a running IMEX service. ",
|
||||
"Falling back to other handle types if available.");
|
||||
}
|
||||
}
|
||||
if ((type_ == GpuIpcMemHandle::Type::None) && (handle_.typeFlags & GpuIpcMemHandle::Type::PosixFd)) {
|
||||
@@ -303,7 +312,17 @@ GpuIpcMem::GpuIpcMem(const GpuIpcMemHandle& handle)
|
||||
type_ = GpuIpcMemHandle::Type::RuntimeIpc;
|
||||
}
|
||||
if (type_ == GpuIpcMemHandle::Type::None) {
|
||||
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (type: ", handle_.typeFlags, ")");
|
||||
const bool fabricOnly = (handle_.typeFlags == GpuIpcMemHandle::Type::Fabric);
|
||||
const std::string hint = fabricOnly
|
||||
? std::string(
|
||||
"The remote rank sent only a Fabric (MNNVL) handle, but this rank could not "
|
||||
"import it. Check that the IMEX daemon is running on both nodes and that the "
|
||||
"GPUs share an NVLink fabric.")
|
||||
: std::string(
|
||||
"All handle types failed to import; check IMEX service and POSIX FD socket "
|
||||
"availability.");
|
||||
THROW(GPU, Error, ErrorCode::Aborted, "Failed to open GpuIpcMemHandle (offered types: ", handle_.typeFlags, "). ",
|
||||
hint);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -36,6 +36,18 @@ inline void requireGdrForIbMode(IbMode mode, mscclpp::Transport ibTransport) {
|
||||
#define REQUIRE_GDR_FOR_IB_MODE(mode) // No extra requirements on non-CUDA platforms.
|
||||
#endif
|
||||
|
||||
// Skip an IPC-only PortChannel test (useIPC=true, useIB=false, useEthernet=false) when CudaIpc
|
||||
// cannot connect this rank pair. CudaIpc works intra-node always, and cross-node only on MNNVL
|
||||
// systems (GB200 NVL72 + IMEX). The combined check is "at least 2 ranks per node" OR "fabric
|
||||
// (MNNVL) handles are usable on this system".
|
||||
#define REQUIRE_CUDA_IPC_AVAILABLE \
|
||||
do { \
|
||||
if (gEnv->nRanksPerNode < 2 && !mscclpp::isFabricMemHandleAvailable()) { \
|
||||
SKIP_TEST() << "CudaIpc requires intra-node ranks (nRanksPerNode>=2) or MNNVL fabric handles, \
|
||||
both unavailable here."; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void PortChannelOneToOneTest::SetUp() {
|
||||
// Use only two ranks
|
||||
setNumRanksToUse(2);
|
||||
@@ -71,7 +83,10 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vector<mscclpp::PortChan
|
||||
continue;
|
||||
}
|
||||
mscclpp::EndpointConfig cfg;
|
||||
if ((rankToNode(r) == rankToNode(gEnv->rank)) && useIPC) {
|
||||
if (useIPC) {
|
||||
// CudaIpc works intra-node always, and cross-node on MNNVL systems (GB200 NVL72 + IMEX)
|
||||
// via fabric handles. Tests that exercise CudaIpc across nodes on non-MNNVL hardware should
|
||||
// gate themselves with REQUIRE_CUDA_IPC_AVAILABLE; we always request CudaIpc here when asked.
|
||||
cfg.transport = mscclpp::Transport::CudaIpc;
|
||||
} else if (useIb) {
|
||||
cfg.transport = ibTransport;
|
||||
@@ -262,6 +277,7 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PingPong) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPong(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -279,6 +295,7 @@ TEST(PortChannelOneToOneTest, PingPongEthernet) {
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PingPongWithPoll) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPong(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -291,6 +308,7 @@ TEST(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) {
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PingPongPerf) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPingPongPerf(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
@@ -482,7 +500,10 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode)
|
||||
proxyService->stopProxy();
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); }
|
||||
TEST(PortChannelOneToOneTest, PacketPingPong) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPacketPingPong(false, IbMode::Default);
|
||||
}
|
||||
|
||||
TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
@@ -490,7 +511,10 @@ TEST(PortChannelOneToOneTest, PacketPingPongIbHostMode) {
|
||||
testPacketPingPong(true, IbMode::Host);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); }
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerf) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testPacketPingPongPerf(false, IbMode::Default);
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) {
|
||||
REQUIRE_IBVERBS;
|
||||
@@ -583,6 +607,7 @@ void PortChannelOneToOneTest::testBandwidth(PingPongTestParams params) {
|
||||
}
|
||||
|
||||
PERF_TEST(PortChannelOneToOneTest, Bandwidth) {
|
||||
REQUIRE_CUDA_IPC_AVAILABLE;
|
||||
testBandwidth(PingPongTestParams{
|
||||
.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default});
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user