From 41bf96abc2d1640f4d7c4a704081acc7362672c4 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Tue, 3 Feb 2026 05:16:11 +0800 Subject: [PATCH 01/52] Fix the relative path extraction on github page (#739) Fix missing 'mscclpp' base directory during version switching on GitHub Pages. --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li --- docs/_static/version-selector.js | 62 ++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/docs/_static/version-selector.js b/docs/_static/version-selector.js index 0efc47fe..7622aefd 100644 --- a/docs/_static/version-selector.js +++ b/docs/_static/version-selector.js @@ -26,27 +26,53 @@ * @returns {string} The base path (e.g., '/mscclpp' or '') */ function detectBasePath() { - const path = window.location.pathname; - // Match pattern: /base-path/vX.Y.Z/... or /base-path/main/... - // The base path is everything before the version or main directory - const match = path.match(/^(\/[^\/]+)?(?=\/(v\d+\.\d+\.\d+|main)\/)/); - if (match && match[1]) { - return match[1]; - } - // Check if we're at a root that's actually a project site - // Look for common indicators like the repository name in the path - const projectMatch = path.match(/^(\/[^\/]+)(?=\/)/); - if (projectMatch) { - // Verify this isn't a version path at root - const potentialBase = projectMatch[1]; - if (!potentialBase.match(/^\/v\d+\.\d+\.\d+$/) && potentialBase !== '/main') { - // Check if the remaining path contains version info - const remainingPath = path.substring(potentialBase.length); - if (remainingPath.match(/^\/(v\d+\.\d+\.\d+|main)\//)) { - return potentialBase; + // Most reliable method: detect from this script's own URL + // The script is always at {base}/_static/version-selector.js or {base}/vX.Y.Z/_static/version-selector.js + const scripts = document.getElementsByTagName('script'); + for (let i = 0; i < scripts.length; i++) { + const src = scripts[i].src; + if (src && (src.includes('/_static/version-selector.js') || src.endsWith('version-selector.js'))) { + try { + const url = new URL(src); + const scriptPath = url.pathname; + // Extract base path: everything before /_static/version-selector.js + // But also strip version directories like /v0.8.0/ or /main/ + const match = scriptPath.match(/^(.*?)\/_static\/version-selector\.js$/); + if (match) { + let basePath = match[1] || ''; + // Remove version suffix if present (e.g., /mscclpp/v0.8.0 -> /mscclpp) + basePath = basePath.replace(/\/(v\d+\.\d+\.\d+|main)$/, ''); + return basePath; + } + } catch (e) { + // URL parsing failed, continue to fallback + // Log a warning to aid debugging when the primary detection method fails. + if (typeof console !== 'undefined' && typeof console.warn === 'function') { + console.warn('version-selector: Failed to parse script URL for base path detection; falling back to location-based detection.', src, e); + } } } } + + // Fallback: try to detect from URL path + const path = window.location.pathname; + const segments = path.split('/').filter(s => s.length > 0); + + if (segments.length >= 1) { + const firstSegment = segments[0]; + // If first segment is not a version tag (vX.Y.Z), not 'main', and + // does not look like a file name (no '.' in the segment), then it's + // the GitHub Pages project base path (e.g., 'mscclpp'). 
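For reference, the two-stage detection in this hunk can be cross-checked with the following Python sketch (an illustrative re-expression only, not part of the patch; the function name is ours, and the sample inputs come from the mappings listed in the surrounding comments):

```python
import re

def detect_base_path(script_path: str, location_path: str) -> str:
    # Primary: everything before /_static/version-selector.js, minus a
    # trailing version directory such as /v0.8.0 or /main.
    m = re.match(r"^(.*?)/_static/version-selector\.js$", script_path)
    if m:
        return re.sub(r"/(v\d+\.\d+\.\d+|main)$", "", m.group(1) or "")
    # Fallback: first segment of the page path, unless it is a version tag,
    # 'main', or looks like a file name (contains a '.').
    segments = [s for s in location_path.split("/") if s]
    if segments:
        first = segments[0]
        if not re.match(r"^v\d+\.\d+\.\d+$", first) and first != "main" and "." not in first:
            return "/" + first
    return ""

assert detect_base_path("/mscclpp/v0.8.0/_static/version-selector.js", "") == "/mscclpp"
assert detect_base_path("", "/mscclpp/index.html") == "/mscclpp"
assert detect_base_path("", "/index.html") == ""
```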
+ // This handles both: + // /mscclpp/v0.8.0/index.html -> base is /mscclpp + // /mscclpp/index.html -> base is /mscclpp + // while avoiding treating root files like /index.html as a base path. + if (!firstSegment.match(/^v\d+\.\d+\.\d+$/) && firstSegment !== 'main' && !firstSegment.includes('.')) { + return '/' + firstSegment; + } + } + + // No base path (root site or local development) return ''; } From 03b1936ddb5d56275b6257164a2c22a40b399c0a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 3 Feb 2026 08:50:45 +0900 Subject: [PATCH 02/52] Support multi-node in `MemoryChannel` tutorial (#726) Co-authored-by: mahdiehghazim Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/tutorials/03-memory-channel.md | 33 +++++++- .../03-memory-channel/bidir_memory_channel.cu | 75 ++++++++++++------- 2 files changed, 81 insertions(+), 27 deletions(-) diff --git a/docs/tutorials/03-memory-channel.md b/docs/tutorials/03-memory-channel.md index 00e2192b..c6a8b9e1 100644 --- a/docs/tutorials/03-memory-channel.md +++ b/docs/tutorials/03-memory-channel.md @@ -78,7 +78,7 @@ mscclpp::GpuBuffer buffer(bufferBytes); mscclpp::RegisteredMemory localRegMem = comm.registerMemory(buffer.data(), buffer.bytes(), transport); ``` -Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (such as [NVL72](https://www.nvidia.com/en-us/data-center/gb300-nvl72) on NVIDIA Grace Blackwell platforms). We will introduce other transport types in later tutorials. +Here, we first allocate GPU device memory using `mscclpp::GpuBuffer` and then register its memory region with the `registerMemory()` method of the `Communicator`. If you are using the `Context` interface as shown in the [Basic Concepts](./01-basic-concepts.md) tutorial, you can use `context.registerMemory()` instead. The `transport` parameter specifies the transport types that this memory region can be accessed with. In this example, we use only `mscclpp::Transport::CudaIpc`, which allows the memory to be accessed by other processes using CUDA/HIP IPC. The `CudaIpc` transport type is typically used for intra-node communication, but with certain hardware configurations, it can also be used for inter-node communication (will be explained in a later section: {ref}`mc-cross-node`). We will introduce other transport types in later tutorials. **GpuBuffer** is NOT required for creating a `RegisteredMemory`; you can register any pre-allocated GPU memory region with `registerMemory()`. However, it is the user's responsibility to ensure that the memory region is suitable for their communication operations. Depending on the hardware platform, some communication methods may require specific memory allocation to ensure data consistency and correctness. `GpuBuffer` is a convenient way to allocate GPU memory that is compatible with the communication methods that MSCCL++ supports. 
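For readers following this tutorial from Python rather than C++, the same allocate-and-register step looks roughly like the sketch below, using the Python bindings that appear later in this patch series (`communicator` is an assumed, already-constructed `mscclpp.Communicator`; the buffer shape is illustrative):

```python
import numpy as np
from mscclpp import GpuBuffer, Transport, TransportFlags

# Allocate device memory compatible with MSCCL++'s transports, then
# register it for CUDA/HIP IPC access, mirroring the C++ snippet above.
buffer = GpuBuffer((1 << 20,), dtype=np.float32)
local_reg_mem = communicator.register_memory(
    buffer.data.ptr, buffer.nbytes, TransportFlags(Transport.CudaIpc)
)
```

Returning to `GpuBuffer` itself: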
It provides a simple interface for allocating GPU memory and automatically handles memory deallocation when it goes out of scope. @@ -251,6 +251,37 @@ columns 2 Since the flags take 50% of the packet size, the goodput of communication using packets is only 50% compared to transferring raw data. However, this doesn't matter because packets are designed for small data transfers. Packets transfer small data efficiently because the integrity of the user data is guaranteed by only waiting for the correct flags (done by `unpackPackets()`); explicit memory synchronization (signal and wait) is not needed. +(mc-cross-node)= +## Cross-node Execution + +For **inter-node** communication, using `PortChannel` (will be explained in the following tutorial) is usually a more accessible option that leverages more widely-used networking interfaces. However, `MemoryChannel` can still be used as long as the underlying hardware allows memory mapping between the two GPUs, such as [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/overview.html) on NVIDIA Grace Blackwell platforms. + +We can use the same example code to test inter-node `MemoryChannel`. Users can consult the [NVIDIA MNNVL verification guide](https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html) for verification steps and detailed environment requirements for MNNVL. + +Run the program on two nodes with command line arguments: + +``` +./bidir_memory_channel [ ] +``` + +For example, assume we use `192.168.0.1:50000` as the bootstrap IP address and port, and both nodes use GPU 0 locally. + +On Node 0 (Rank 0): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 0 0 +``` + +On Node 1 (Rank 1): +```bash +$ ./bidir_memory_channel 192.168.0.1:50000 1 0 +``` + +You should see output indicating successful data transfer. + +```{tip} +If your bootstrap IP address is not on the default network interface of your node, you can specify the network interface by passing `interface_name:ip:port` as the first argument (such as `eth1:192.168.0.1:50000`). +``` + ## Summary and Next Steps In this tutorial, you have learned how to use `MemoryChannel` for efficient data transfer between GPUs. You have also learned how to create communication buffers using `RegisteredMemory` and `GpuBuffer`, and how to use packets for small data transfers. You can find more complex usage of `MemoryChannel` in the {ref}`mscclpp-test`. diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index e9007612..cfbf12d7 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -95,9 +95,8 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int gpuId) { +void worker(int myRank, int gpuId, const std::string &ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); - const int myRank = gpuId; const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; const int iter = 1000; @@ -105,11 +104,11 @@ void worker(int gpuId) { const size_t bufferBytes = 256 * 1024 * 1024; const size_t pktBufferBytes = 256 * 1024 * 1024; - log("GPU ", gpuId, ": Preparing for tests ..."); + log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ..."); // Build a connection and a semaphore auto bootstrap = std::make_shared(myRank, nRanks); - bootstrap->initialize("lo:127.0.0.1:" PORT_NUMBER); + bootstrap->initialize(ipPort); mscclpp::Communicator comm(bootstrap); auto conn = comm.connect({transport, {mscclpp::DeviceType::GPU, gpuId}}, remoteRank).get(); auto sema = comm.buildSemaphore(conn, remoteRank).get(); @@ -162,7 +161,7 @@ void worker(int gpuId) { }; cudaEvent_t start, end; - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); } @@ -189,13 +188,13 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); bootstrap->barrier(); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(start, stream)); } MSCCLPP_CUDATHROW(cudaGraphLaunch(graphExec, stream)); - if (gpuId == 0) { + if (myRank == 0) { MSCCLPP_CUDATHROW(cudaEventRecord(end, stream)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); float elapsedTime; @@ -204,8 +203,8 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end)); elapsedTimePerIter = elapsedTime / iter; gbps = float(copyBytes) / elapsedTimePerIter * 1e-6f; - log("GPU ", gpuId, ": [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ", - gbps, " GB/s"); + log("Rank ", myRank, " (GPU ", gpuId, "): [", testName, "] bytes ", copyBytes, ", elapsed ", elapsedTimePerIter, + " ms/iter, BW ", gbps, " GB/s"); } MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); MSCCLPP_CUDATHROW(cudaGraphExecDestroy(graphExec)); @@ -216,23 +215,47 @@ void worker(int gpuId) { bootstrap->barrier(); } -int main() { - int pid0 = spawn_process([]() { worker(0); }); - int pid1 = spawn_process([]() { worker(1); }); - if (pid0 < 0 || pid1 < 0) { - log("Failed to spawn processes."); +int main(int argc, char **argv) { + if (argc == 1) { + int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); + int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); + if (pid0 < 0 || pid1 < 0) { + log("Failed to spawn processes."); + return -1; + } + int status0 = wait_process(pid0); + int status1 = wait_process(pid1); + if (status0 < 0 || status1 < 0) { + log("Failed to wait for processes."); + return -1; + } + if (status0 != 0 || status1 != 0) { + log("One of the processes failed."); + return -1; + } + log("Succeed!"); + return 0; + } else if (argc == 4) { + std::string ipPort = argv[1]; + int rank, gpuId; + try { + rank = std::stoi(argv[2]); + gpuId = std::stoi(argv[3]); + } catch (const std::exception &) { + log("Error: rank and gpu_id must be valid integers."); + return -1; + } + if (rank < 0 || rank > 2 || gpuId < 0) { + log("Error: rank must be between 0 and 1 and gpu_id must be non-negative."); + return -1; + } + worker(rank, gpuId, ipPort); + log("Rank ", rank, ": Succeed!"); + return 0; + } else { + std::cerr << "Usage:\n" + << " " << argv[0] << " Run in intra-node mode\n" + << " " << argv[0] << " Run in inter-node mode\n"; return -1; } - int status0 = wait_process(pid0); - int status1 = wait_process(pid1); - if (status0 < 0 || status1 < 0) { - log("Failed to wait for processes."); - return -1; - } - if (status0 != 0 || status1 != 
0) { - log("One of the processes failed."); - return -1; - } - log("Succeed!"); - return 0; } From e21513791a79f62768a9f8f9b8517ebf803d2eed Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 3 Feb 2026 10:13:20 -0800 Subject: [PATCH 03/52] Address comments for PR #692 (#733) Rename nanobind-exposed C++ types to Cpp* Replace MSCCLPP_EXECUTION_PLAN_DIR / MSCCLPP_NATIVE_CACHE_DIR with MSCCLPP_CACHE_DIR across C++ and Python. --- docs/conf.py | 18 ++- docs/dsl/results.md | 2 +- docs/py_api.rst | 4 +- docs/quickstart.md | 2 +- include/mscclpp/env.hpp | 6 +- python/csrc/algorithm.cpp | 20 ++-- python/csrc/core_py.cpp | 36 +++--- python/csrc/env_py.cpp | 4 +- python/csrc/error_py.cpp | 2 +- python/csrc/executor_py.cpp | 6 +- .../ext/algorithm_collection_builder_py.cpp | 2 +- python/csrc/fifo_py.cpp | 6 +- python/csrc/gpu_utils_py.cpp | 2 +- python/csrc/memory_channel_py.cpp | 8 +- python/csrc/npkit_py.cpp | 2 +- python/csrc/numa_py.cpp | 2 +- python/csrc/port_channel_py.cpp | 12 +- python/csrc/semaphore_py.cpp | 6 +- python/csrc/switch_channel_py.cpp | 6 +- python/mscclpp/__init__.py | 52 ++++----- python/mscclpp/__main__.py | 4 +- python/mscclpp/_core/__init__.py | 6 - python/mscclpp/_core/algorithm.py | 52 +++++---- python/mscclpp/_core/buffer.py | 4 +- python/mscclpp/_core/comm.py | 110 +++++++++--------- python/mscclpp/_core/compiler.py | 19 ++- python/mscclpp/ext/__init__.py | 2 - .../ext/algorithm_collection_builder.py | 8 +- python/mscclpp/utils.py | 2 +- src/core/env.cpp | 5 +- .../algorithm_collection_builder.cc | 6 +- 31 files changed, 211 insertions(+), 205 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fdfb8d66..52321465 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,6 +11,18 @@ import sys import importlib.util from pathlib import Path +from unittest.mock import MagicMock + + +class NamedMock(MagicMock): + def __getattr__(self, name): + attr = super().__getattr__(name) + if isinstance(attr, MagicMock): + # Assigns __name__ and __qualname__ to satisfy Sphinx autodoc inspection. + attr.__name__ = name + attr.__qualname__ = name + return attr + # Add the python package to sys.path so Sphinx can find it project_root = Path(__file__).parent.parent @@ -63,7 +75,7 @@ autodoc_default_options = { "show-inheritance": True, } # only mock the C-extension when using the source tree -autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] +autodoc_mock_imports = ["mscclpp._version", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"] autodoc_typehints = "description" napoleon_google_docstring = True napoleon_numpy_docstring = True @@ -71,6 +83,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numpy": ("https://numpy.org/doc/stable/", None), } +mock_mscclpp = NamedMock() +# Set attributes to satisfy Sphinx autodoc inspection. 
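+# Downstream code (e.g. mscclpp._core.compiler) builds filesystem paths from
+# env().cache_dir, so the mock supplies a plain string rather than a MagicMock.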
+mock_mscclpp.env.return_value.cache_dir = "_mscclpp" +sys.modules["mscclpp._mscclpp"] = mock_mscclpp templates_path = ["_templates"] exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] diff --git a/docs/dsl/results.md b/docs/dsl/results.md index a34eae5b..99f19476 100644 --- a/docs/dsl/results.md +++ b/docs/dsl/results.md @@ -56,7 +56,7 @@ python3 -m mscclpp --install After installation, the generated JSON execution plan can be found at: ``` -~/.cache/mscclpp_default/ +~/.cache/mscclpp/default/ ``` **Performance Results:** diff --git a/docs/py_api.rst b/docs/py_api.rst index 5ea39bc3..7acc9273 100644 --- a/docs/py_api.rst +++ b/docs/py_api.rst @@ -7,6 +7,4 @@ This reference organizes the MSCCL++ Python API. :toctree: py_api :recursive: - mscclpp.comm - mscclpp.utils - mscclpp.language + mscclpp diff --git a/docs/quickstart.md b/docs/quickstart.md index 04a26466..ac1b7d6b 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -196,7 +196,7 @@ mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib Example 2, ReduceScatter will still use msccl++ implementation since reducescatter is not in the fallbacklist. ```bash export LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH; -mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" -x MSCCLPP_EXECUTION_PLAN_DIR=/$PATH_TO_EXECUTION_PLANS/execution-files ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 +mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$MSCCLPP_BUILD/lib/libmscclpp_nccl.so -x MSCCLPP_ENABLE_NCCL_FALLBACK=TRUE -x MSCCLPP_NCCL_LIB_PATH=$NCCL_BUILD/lib/libnccl.so -x MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION="broadcast" ./build/reduce_scatter_perf -b 1K -e 256M -f 2 -d half -G 20 -w 10 -n 50 ``` On AMD platforms, you need to add `RCCL_MSCCL_ENABLE=0` to avoid conflicts with the fallback features. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 5972234b..9d78cd1a 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -70,9 +70,9 @@ class Env { /// Env name: `MSCCLPP_COMM_ID`. To be deprecated; don't use this. const std::string commId; - /// Env name: `MSCCLPP_EXECUTION_PLAN_DIR`. The directory to find execution plans from. This should be set to - /// use execution plans for the NCCL API. Unset by default. - const std::string executionPlanDir; + /// Env name: `MSCCLPP_CACHE_DIR`. The directory to use for caching execution plans and other temporary files. + /// If unset, it defaults to `~/.cache/mscclpp`. + const std::string cacheDir; /// Env name: `MSCCLPP_NPKIT_DUMP_DIR`. The directory to dump NPKIT traces to. If this is set, NPKIT will be /// enabled and will dump traces to this directory. Unset by default. 
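Taken together with the `env.hpp` comment above, the new cache layout resolves with a single rule. The sketch below restates it in Python for reference (the helper name is ours; the default matches `src/core/env.cpp` further down in this patch):

```python
import os
from pathlib import Path

def mscclpp_cache_dir() -> Path:
    """MSCCLPP_CACHE_DIR wins; otherwise fall back to $HOME/.cache/mscclpp."""
    default = os.path.join(os.environ.get("HOME", "~"), ".cache", "mscclpp")
    return Path(os.environ.get("MSCCLPP_CACHE_DIR", default))

# Derived locations referenced elsewhere in this patch:
#   default DSL plans       -> mscclpp_cache_dir() / "default"
#   native build artifacts  -> mscclpp_cache_dir() / "native"
```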
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 28edfe2d..3553256a 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -16,14 +16,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_algorithm(nb::module_& m) { - nb::enum_(m, "CollectiveBufferMode") + nb::enum_(m, "CppCollectiveBufferMode") .value("ANY", CollectiveBufferMode::Any) .value("IN_PLACE", CollectiveBufferMode::InPlace) .value("OUT_OF_PLACE", CollectiveBufferMode::OutOfPlace); - nb::enum_(m, "AlgorithmType").value("NATIVE", AlgorithmType::Native).value("DSL", AlgorithmType::DSL); + nb::enum_(m, "CppAlgorithmType") + .value("NATIVE", AlgorithmType::Native) + .value("DSL", AlgorithmType::DSL); - nb::enum_(m, "CommResult") + nb::enum_(m, "CppCommResult") .value("COMM_SUCCESS", CommResult::CommSuccess) .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::CommUnhandledCudaError) .value("COMM_SYSTEM_ERROR", CommResult::CommSystemError) @@ -34,13 +36,13 @@ void register_algorithm(nb::module_& m) { .value("COMM_IN_PROGRESS", CommResult::CommInProgress) .value("COMM_NUM_RESULTS", CommResult::CommNumResults); - nb::enum_(m, "ReduceOp") + nb::enum_(m, "CppReduceOp") .value("SUM", ReduceOp::SUM) .value("MIN", ReduceOp::MIN) .value("NOP", ReduceOp::NOP); auto algorithmClass = - nb::class_(m, "Algorithm") + nb::class_(m, "CppAlgorithm") .def_static( "from_native_capsule", [](nb::capsule cap) { @@ -83,21 +85,21 @@ void register_algorithm(nb::module_& m) { .def_rw("world_size", &Algorithm::Constraint::worldSize) .def_rw("n_ranks_per_node", &Algorithm::Constraint::nRanksPerNode); - nb::class_(m, "AlgorithmBuilder").def("build", &AlgorithmBuilder::build); + nb::class_(m, "CppAlgorithmBuilder").def("build", &AlgorithmBuilder::build); - nb::class_(m, "DslAlgorithm") + nb::class_(m, "CppDslAlgorithm") .def(nb::init, Algorithm::Constraint>(), nb::arg("id"), nb::arg("plan"), nb::arg("tags") = std::unordered_map(), nb::arg("constraint") = Algorithm::Constraint()) .def("build", &DslAlgorithm::build); - nb::class_(m, "AlgorithmCollection") + nb::class_(m, "CppAlgorithmCollection") .def("register_algorithm", &AlgorithmCollection::registerAlgorithm, nb::arg("collective"), nb::arg("algo_name"), nb::arg("algorithm")) .def("get_algorithms_by_collective", &AlgorithmCollection::getAlgorithmsByCollective, nb::arg("collective")) .def("to_list", &AlgorithmCollection::getAllAlgorithms); - nb::class_(m, "CollectiveRequest") + nb::class_(m, "CppCollectiveRequest") .def_ro("world_size", &CollectiveRequest::worldSize) .def_ro("n_ranks_per_node", &CollectiveRequest::nRanksPerNode) .def_ro("rank", &CollectiveRequest::rank) diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index ba6af1dd..9f085675 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -32,21 +32,21 @@ extern void register_algorithm_collection_builder(nb::module_& m); template void def_shared_future(nb::handle& m, const std::string& typestr) { - std::string pyclass_name = std::string("shared_future_") + typestr; + std::string pyclass_name = std::string("CppSharedFuture_") + typestr; nb::class_>(m, pyclass_name.c_str()).def("get", &std::shared_future::get); } void register_core(nb::module_& m) { m.def("version", &version); - nb::enum_(m, "DataType") + nb::enum_(m, "CppDataType") .value("int32", DataType::INT32) .value("uint32", DataType::UINT32) .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32) .value("bfloat16", DataType::BFLOAT16); - nb::class_(m, "Bootstrap") + nb::class_(m, "CppBootstrap") 
.def("get_rank", &Bootstrap::getRank) .def("get_n_ranks", &Bootstrap::getNranks) .def("get_n_ranks_per_node", &Bootstrap::getNranksPerNode) @@ -71,7 +71,7 @@ void register_core(nb::module_& m) { .def("recv", static_cast&, int, int)>(&Bootstrap::recv), nb::arg("data"), nb::arg("peer"), nb::arg("tag")); - nb::class_(m, "UniqueId") + nb::class_(m, "CppUniqueId") .def(nb::init<>()) .def("__setstate__", [](UniqueId& self, nb::bytes b) { @@ -81,7 +81,7 @@ void register_core(nb::module_& m) { .def("__getstate__", [](const UniqueId& self) { return nb::bytes(reinterpret_cast(self.data()), UniqueIdBytes); }); - nb::class_(m, "TcpBootstrap") + nb::class_(m, "CppTcpBootstrap") .def(nb::init(), "Do not use this constructor. Use create instead.") .def_static( "create", [](int rank, int nRanks) { return std::make_shared(rank, nRanks); }, nb::arg("rank"), @@ -93,7 +93,7 @@ void register_core(nb::module_& m) { .def("initialize", static_cast(&TcpBootstrap::initialize), nb::call_guard(), nb::arg("if_ip_port_trio"), nb::arg("timeout_sec") = 30); - nb::enum_(m, "Transport") + nb::enum_(m, "CppTransport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) .value("IB0", Transport::IB0) @@ -106,7 +106,7 @@ void register_core(nb::module_& m) { .value("IB7", Transport::IB7) .value("NumTransports", Transport::NumTransports); - nb::class_(m, "TransportFlags") + nb::class_(m, "CppTransportFlags") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def("has", &TransportFlags::has, nb::arg("transport")) @@ -130,12 +130,12 @@ void register_core(nb::module_& m) { .def(nb::self == nb::self) .def(nb::self != nb::self); - nb::enum_(m, "DeviceType") + nb::enum_(m, "CppDeviceType") .value("Unknown", DeviceType::Unknown) .value("CPU", DeviceType::CPU) .value("GPU", DeviceType::GPU); - nb::class_(m, "Device") + nb::class_(m, "CppDevice") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("type")) .def(nb::init(), nb::arg("type"), nb::arg("id") = -1) @@ -147,7 +147,7 @@ void register_core(nb::module_& m) { return ss.str(); }); - nb::class_(m, "EndpointConfigIb") + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, @@ -164,7 +164,7 @@ void register_core(nb::module_& m) { .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); - nb::class_(m, "RegisteredMemory") + nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) .def("data", [](RegisteredMemory& self) { return reinterpret_cast(self.data()); }) .def("size", &RegisteredMemory::size) @@ -172,7 +172,7 @@ void register_core(nb::module_& m) { .def("serialize", &RegisteredMemory::serialize) .def_static("deserialize", &RegisteredMemory::deserialize, nb::arg("data")); - nb::class_(m, "Endpoint") + nb::class_(m, "CppEndpoint") .def("config", &Endpoint::config) .def("transport", &Endpoint::transport) .def("device", &Endpoint::device) @@ -180,7 +180,7 @@ void register_core(nb::module_& m) { .def("serialize", &Endpoint::serialize) .def_static("deserialize", &Endpoint::deserialize, nb::arg("data")); - nb::class_(m, "Connection") + nb::class_(m, "CppConnection") .def("write", &Connection::write, nb::arg("dst"), nb::arg("dstOffset"), nb::arg("src"), nb::arg("srcOffset"), nb::arg("size")) .def( @@ -197,7 +197,7 @@ void register_core(nb::module_& m) { .def("local_device", &Connection::localDevice) .def("get_max_write_queue_size", &Connection::getMaxWriteQueueSize); - nb::class_(m, 
"EndpointConfig") + nb::class_(m, "CppEndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) .def(nb::init(), nb::arg("transport"), nb::arg("device"), @@ -228,7 +228,7 @@ void register_core(nb::module_& m) { [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); - nb::class_(m, "Context") + nb::class_(m, "CppContext") .def_static("create", &Context::create) .def( "register_memory", @@ -239,13 +239,13 @@ void register_core(nb::module_& m) { .def("create_endpoint", &Context::createEndpoint, nb::arg("config")) .def("connect", &Context::connect, nb::arg("local_endpoint"), nb::arg("remote_endpoint")); - nb::class_(m, "SemaphoreStub") + nb::class_(m, "CppSemaphoreStub") .def(nb::init(), nb::arg("connection")) .def("memory", &SemaphoreStub::memory) .def("serialize", &SemaphoreStub::serialize) .def_static("deserialize", &SemaphoreStub::deserialize, nb::arg("data")); - nb::class_(m, "Semaphore") + nb::class_(m, "CppSemaphore") .def(nb::init<>()) .def(nb::init(), nb::arg("local_stub"), nb::arg("remote_stub")) .def("connection", &Semaphore::connection) @@ -256,7 +256,7 @@ void register_core(nb::module_& m) { def_shared_future(m, "Connection"); def_shared_future(m, "Semaphore"); - nb::class_(m, "Communicator") + nb::class_(m, "CppCommunicator") .def(nb::init, std::shared_ptr>(), nb::arg("bootstrap"), nb::arg("context") = nullptr) .def("bootstrap", &Communicator::bootstrap) diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index a0ba4a4e..360acc6f 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -11,7 +11,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_env(nb::module_& m) { - nb::class_(m, "Env") + nb::class_(m, "CppEnv") .def_ro("debug", &Env::debug) .def_ro("debug_subsys", &Env::debugSubsys) .def_ro("debug_file", &Env::debugFile) @@ -20,7 +20,7 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) - .def_ro("execution_plan_dir", &Env::executionPlanDir) + .def_ro("cache_dir", &Env::cacheDir) .def_ro("npkit_dump_dir", &Env::npkitDumpDir) .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index ff532d10..208f4e84 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -22,7 +22,7 @@ using namespace mscclpp; m.attr(#name_).ptr()); void register_error(nb::module_ &m) { - nb::enum_(m, "ErrorCode") + nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) .value("RemoteError", ErrorCode::RemoteError) diff --git a/python/csrc/executor_py.cpp b/python/csrc/executor_py.cpp index 0a196f37..350a1e7a 100644 --- a/python/csrc/executor_py.cpp +++ b/python/csrc/executor_py.cpp @@ -15,16 +15,16 @@ namespace nb = nanobind; using namespace mscclpp; void register_executor(nb::module_& m) { - nb::enum_(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); + nb::enum_(m, "CppPacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16); - nb::class_(m, "ExecutionPlan") + nb::class_(m, "CppExecutionPlan") .def(nb::init(), nb::arg("planPath"), nb::arg("rank")) .def_prop_ro("name", [](const ExecutionPlan& self) -> std::string { return self.name(); }) .def_prop_ro("collective", [](const ExecutionPlan& self) -> std::string { return self.collective(); }) 
.def_prop_ro("min_message_size", [](const ExecutionPlan& self) -> size_t { return self.minMessageSize(); }) .def_prop_ro("max_message_size", [](const ExecutionPlan& self) -> size_t { return self.maxMessageSize(); }); - nb::class_(m, "Executor") + nb::class_(m, "CppExecutor") .def(nb::init>(), nb::arg("comm")) .def( "execute", diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 2756edb7..1a912724 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -15,7 +15,7 @@ using namespace mscclpp; using namespace mscclpp::collective; void register_algorithm_collection_builder(nb::module_& m) { - nb::class_(m, "AlgorithmCollectionBuilder") + nb::class_(m, "CppAlgorithmCollectionBuilder") .def_static("get_instance", &AlgorithmCollectionBuilder::getInstance) .def("add_algorithm_builder", &AlgorithmCollectionBuilder::addAlgorithmBuilder, nb::arg("builder")) .def( diff --git a/python/csrc/fifo_py.cpp b/python/csrc/fifo_py.cpp index 63be4a33..e8b6a3e2 100644 --- a/python/csrc/fifo_py.cpp +++ b/python/csrc/fifo_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_fifo(nb::module_& m) { - nb::class_(m, "ProxyTrigger") + nb::class_(m, "CppProxyTrigger") .def_prop_rw( "fst", [](const ProxyTrigger& self) { return self.fst; }, [](ProxyTrigger& self, uint64_t v) { self.fst = v; }) @@ -17,7 +17,7 @@ void register_fifo(nb::module_& m) { "snd", [](const ProxyTrigger& self) { return self.snd; }, [](ProxyTrigger& self, uint64_t v) { self.snd = v; }); - nb::class_(m, "FifoDeviceHandle") + nb::class_(m, "CppFifoDeviceHandle") .def_rw("triggers", &FifoDeviceHandle::triggers) .def_rw("tail", &FifoDeviceHandle::tail) .def_rw("head", &FifoDeviceHandle::head) @@ -26,7 +26,7 @@ void register_fifo(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Fifo") + nb::class_(m, "CppFifo") .def(nb::init(), nb::arg("size") = DEFAULT_FIFO_SIZE) .def("poll", &Fifo::poll) .def("pop", &Fifo::pop) diff --git a/python/csrc/gpu_utils_py.cpp b/python/csrc/gpu_utils_py.cpp index 66f036e2..6995756b 100644 --- a/python/csrc/gpu_utils_py.cpp +++ b/python/csrc/gpu_utils_py.cpp @@ -101,7 +101,7 @@ static nb::capsule toDlpack(GpuBuffer buffer, std::string dataType, std::v void register_gpu_utils(nb::module_& m) { m.def("is_nvls_supported", &isNvlsSupported); - nb::class_>(m, "RawGpuBuffer") + nb::class_>(m, "CppRawGpuBuffer") .def(nb::init(), nb::arg("nelems")) .def("nelems", &GpuBuffer::nelems) .def("bytes", &GpuBuffer::bytes) diff --git a/python/csrc/memory_channel_py.cpp b/python/csrc/memory_channel_py.cpp index 4f9d90a0..ecccb1a0 100644 --- a/python/csrc/memory_channel_py.cpp +++ b/python/csrc/memory_channel_py.cpp @@ -11,20 +11,20 @@ namespace nb = nanobind; using namespace mscclpp; void register_memory_channel(nb::module_& m) { - nb::class_(m, "BaseMemoryChannel") + nb::class_(m, "CppBaseMemoryChannel") .def(nb::init<>()) .def(nb::init>(), nb::arg("semaphore")) .def(nb::init(), nb::arg("semaphore")) .def("device_handle", &BaseMemoryChannel::deviceHandle); - nb::class_(m, "BaseMemoryChannelDeviceHandle") + nb::class_(m, "CppBaseMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &BaseMemoryChannel::DeviceHandle::semaphore_) .def_prop_ro("raw", [](const BaseMemoryChannel::DeviceHandle& self) -> nb::bytes { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "MemoryChannel") + 
nb::class_(m, "CppMemoryChannel") .def(nb::init<>()) .def( "__init__", @@ -42,7 +42,7 @@ void register_memory_channel(nb::module_& m) { nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0) .def("device_handle", &MemoryChannel::deviceHandle); - nb::class_(m, "MemoryChannelDeviceHandle") + nb::class_(m, "CppMemoryChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_", &MemoryChannel::DeviceHandle::semaphore_) .def_rw("dst_", &MemoryChannel::DeviceHandle::dst_) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 0557b72d..8aaa8011 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -9,7 +9,7 @@ namespace nb = nanobind; void register_npkit(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("npkit", "NPKit functions"); + nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); sub_m.def("shutdown", &NpKit::Shutdown); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 2489a479..4433ecc8 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -7,7 +7,7 @@ void numaBind(int node); }; // namespace mscclpp void register_numa(nb::module_ &m) { - nb::module_ sub_m = m.def_submodule("numa", "numa functions"); + nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); } diff --git a/python/csrc/port_channel_py.cpp b/python/csrc/port_channel_py.cpp index 4b1aa289..e3dd98f1 100644 --- a/python/csrc/port_channel_py.cpp +++ b/python/csrc/port_channel_py.cpp @@ -11,11 +11,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_port_channel(nb::module_& m) { - nb::class_(m, "BaseProxyService") + nb::class_(m, "CppBaseProxyService") .def("start_proxy", &BaseProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &BaseProxyService::stopProxy); - nb::class_(m, "ProxyService") + nb::class_(m, "CppProxyService") .def(nb::init(), nb::arg("fifo_size") = DEFAULT_FIFO_SIZE) .def("start_proxy", &ProxyService::startProxy, nb::arg("blocking") = false) .def("stop_proxy", &ProxyService::stopProxy) @@ -31,13 +31,13 @@ void register_port_channel(nb::module_& m) { .def("base_port_channel", &ProxyService::basePortChannel, nb::arg("id")) .def("port_channel", &ProxyService::portChannel, nb::arg("id"), nb::arg("dst"), nb::arg("src")); - nb::class_(m, "BasePortChannel") + nb::class_(m, "CppBasePortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy")) .def("device_handle", &BasePortChannel::deviceHandle); - nb::class_(m, "BasePortChannelDeviceHandle") + nb::class_(m, "CppBasePortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", &BasePortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &BasePortChannel::DeviceHandle::semaphore_) @@ -46,13 +46,13 @@ void register_port_channel(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "PortChannel") + nb::class_(m, "CppPortChannel") .def(nb::init<>()) .def(nb::init, std::shared_ptr, MemoryId, MemoryId>(), nb::arg("semaphore_id"), nb::arg("semaphore"), nb::arg("proxy"), nb::arg("dst"), nb::arg("src")) .def("device_handle", &PortChannel::deviceHandle); - nb::class_(m, "PortChannelDeviceHandle") + nb::class_(m, "CppPortChannelDeviceHandle") .def(nb::init<>()) .def_rw("semaphore_id_", 
&PortChannel::DeviceHandle::semaphoreId_) .def_rw("semaphore_", &PortChannel::DeviceHandle::semaphore_) diff --git a/python/csrc/semaphore_py.cpp b/python/csrc/semaphore_py.cpp index 665d395e..36d559f2 100644 --- a/python/csrc/semaphore_py.cpp +++ b/python/csrc/semaphore_py.cpp @@ -10,7 +10,7 @@ namespace nb = nanobind; using namespace mscclpp; void register_semaphore(nb::module_& m) { - nb::class_ host2DeviceSemaphore(m, "Host2DeviceSemaphore"); + nb::class_ host2DeviceSemaphore(m, "CppHost2DeviceSemaphore"); host2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2DeviceSemaphore::connection) @@ -25,7 +25,7 @@ void register_semaphore(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "Host2HostSemaphore") + nb::class_(m, "CppHost2HostSemaphore") .def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &Host2HostSemaphore::connection) @@ -34,7 +34,7 @@ void register_semaphore(nb::module_& m) { .def("wait", &Host2HostSemaphore::wait, nb::call_guard(), nb::arg("max_spin_count") = 10000000); - nb::class_ memoryDevice2DeviceSemaphore(m, "MemoryDevice2DeviceSemaphore"); + nb::class_ memoryDevice2DeviceSemaphore(m, "CppMemoryDevice2DeviceSemaphore"); memoryDevice2DeviceSemaphore.def(nb::init(), nb::arg("semaphore")) .def(nb::init(), nb::arg("communicator"), nb::arg("connection")) .def("connection", &MemoryDevice2DeviceSemaphore::connection) diff --git a/python/csrc/switch_channel_py.cpp b/python/csrc/switch_channel_py.cpp index dd72c97e..2d0340dd 100644 --- a/python/csrc/switch_channel_py.cpp +++ b/python/csrc/switch_channel_py.cpp @@ -15,11 +15,11 @@ namespace nb = nanobind; using namespace mscclpp; void register_nvls(nb::module_& m) { - nb::class_(m, "SwitchChannel") + nb::class_(m, "CppSwitchChannel") .def("get_device_ptr", [](SwitchChannel* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &SwitchChannel::deviceHandle); - nb::class_(m, "DeviceHandle") + nb::class_(m, "CppSwitchChannelDeviceHandle") .def(nb::init<>()) .def_rw("device_ptr", &SwitchChannel::DeviceHandle::devicePtr) .def_rw("mc_ptr", &SwitchChannel::DeviceHandle::mcPtr) @@ -28,7 +28,7 @@ void register_nvls(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection") + nb::class_(m, "CppNvlsConnection") .def("bind_allocated_memory", &NvlsConnection::bindAllocatedMemory, nb::arg("device_ptr"), nb::arg("size")); m.def("connect_nvls_collective", &connectNvlsCollective, nb::arg("communicator"), nb::arg("all_ranks"), diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 58233a7c..86923003 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -23,35 +23,35 @@ version = { from ._core import * from ._mscclpp import ( - Device, - DeviceType, - Communicator, - Connection, + CppDevice as Device, + CppDeviceType as DeviceType, + CppCommunicator as Communicator, + CppConnection as Connection, connect_nvls_collective, - EndpointConfig, - Fifo, - Semaphore, - Host2DeviceSemaphore, - Host2HostSemaphore, - numa, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - MemoryDevice2DeviceSemaphore, - TcpBootstrap, - Transport, - TransportFlags, - DataType, - ErrorCode, - Executor, - ExecutionPlan, - PacketType, - RawGpuBuffer, - ReduceOp, + CppEndpointConfig as EndpointConfig, + CppFifo as Fifo, + CppSemaphore as 
Semaphore, + CppHost2DeviceSemaphore as Host2DeviceSemaphore, + CppHost2HostSemaphore as Host2HostSemaphore, + cpp_numa as numa, + CppProxyService as ProxyService, + CppRegisteredMemory as RegisteredMemory, + CppPortChannel as PortChannel, + CppMemoryChannel as MemoryChannel, + CppMemoryDevice2DeviceSemaphore as MemoryDevice2DeviceSemaphore, + CppTcpBootstrap as TcpBootstrap, + CppTransport as Transport, + CppTransportFlags as TransportFlags, + CppDataType as DataType, + CppErrorCode as ErrorCode, + CppExecutor as Executor, + CppExecutionPlan as ExecutionPlan, + CppPacketType as PacketType, + CppRawGpuBuffer as RawGpuBuffer, + CppReduceOp as ReduceOp, env, is_nvls_supported, - npkit, + cpp_npkit as npkit, ) __all__ = [ diff --git a/python/mscclpp/__main__.py b/python/mscclpp/__main__.py index 6d0e0108..d57cb362 100644 --- a/python/mscclpp/__main__.py +++ b/python/mscclpp/__main__.py @@ -6,7 +6,7 @@ import shutil import argparse from pathlib import Path -from mscclpp.language import default_algos as def_algo +from mscclpp import default_algos as def_algo from mscclpp.language.collectives import * from mscclpp.language.utils import AlgoSpec @@ -57,7 +57,7 @@ default_algo_configs = [ def create_default_plans(): - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp_default") + plan_dir = os.environ.get("MSCCLPP_CACHE_DIR", Path.home() / ".cache/mscclpp/default") plan_path = Path(plan_dir) if plan_path.exists(): shutil.rmtree(plan_path) diff --git a/python/mscclpp/_core/__init__.py b/python/mscclpp/_core/__init__.py index e9d886f3..a97c91a0 100644 --- a/python/mscclpp/_core/__init__.py +++ b/python/mscclpp/_core/__init__.py @@ -5,9 +5,3 @@ from .algorithm import * from .comm import * from .compiler import * from .buffer import * - -__all__ = [] -__all__ += algorithm.__all__ -__all__ += comm.__all__ -__all__ += compiler.__all__ -__all__ += buffer.__all__ diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index abaac60c..6c4a3f20 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -7,15 +7,17 @@ from functools import cached_property from mscclpp._mscclpp import ( - Algorithm as _Algorithm, - DslAlgorithm as _DslAlgorithm, - AlgorithmType as _AlgorithmType, - Communicator, - CollectiveBufferMode, - DataType, - Executor, - ExecutionPlan, - ReduceOp, + CppAlgorithm, + CppDslAlgorithm, + CppAlgorithmType, + CppCommunicator, + CppCollectiveBufferMode, + CppDataType, + CppExecutor, + CppExecutionPlan, + CppReduceOp, + CppAlgorithmBuilder, + CppAlgorithmCollection, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -45,7 +47,7 @@ class Algorithm: """ def __init__(self, world_size: int = 0, n_ranks_per_node: int = 0): - self._constraint = _Algorithm.Constraint(world_size, n_ranks_per_node) + self._constraint = CppAlgorithm.Constraint(world_size, n_ranks_per_node) @property def world_size(self) -> int: @@ -58,23 +60,23 @@ class Algorithm: def __init__( self, id: Optional[str] = None, - execution_plan: Optional[ExecutionPlan] = None, - native_handle: Optional[_Algorithm] = None, + execution_plan: Optional[CppExecutionPlan] = None, + native_handle: Optional[CppAlgorithm] = None, tags: Optional[Dict[str, int]] = None, constraint: Optional[Constraint] = None, ): if execution_plan is not None: - self._algorithm = _DslAlgorithm( + self._algorithm = CppDslAlgorithm( id, execution_plan, tags=tags if tags is not None else {}, - constraint=constraint._constraint if constraint is not None else 
_Algorithm.Constraint(), + constraint=constraint._constraint if constraint is not None else CppAlgorithm.Constraint(), ) elif native_handle is not None: self._algorithm = native_handle @classmethod - def create_from_native_handle(cls, handle: _Algorithm): + def create_from_native_handle(cls, handle: CppAlgorithm): """Create an Algorithm instance from a native C++ algorithm handle. Args: @@ -97,7 +99,7 @@ class Algorithm: Returns: A new Algorithm instance wrapping the algorithm from the capsule. """ - handle = _Algorithm.from_native_capsule(obj) + handle = CppAlgorithm.from_native_capsule(obj) return cls(native_handle=handle) @cached_property @@ -121,7 +123,7 @@ class Algorithm: return self._algorithm.tags @cached_property - def buffer_mode(self) -> CollectiveBufferMode: + def buffer_mode(self) -> CppCollectiveBufferMode: """The buffer mode supported by this algorithm (IN_PLACE, OUT_OF_PLACE, or ANY).""" return self._algorithm.buffer_mode @@ -131,7 +133,7 @@ class Algorithm: Returns: True if this algorithm is defined using DSL/execution plan, False otherwise. """ - if self._algorithm.type == _AlgorithmType.DSL: + if self._algorithm.type == CppAlgorithmType.DSL: return True return False @@ -141,21 +143,21 @@ class Algorithm: Returns: True if this algorithm is implemented natively, False otherwise. """ - if self._algorithm.type == _AlgorithmType.NATIVE: + if self._algorithm.type == CppAlgorithmType.NATIVE: return True return False def execute( self, - comm: Communicator, + comm: CppCommunicator, input_buffer: int, output_buffer: int, input_size: int, output_size: int, - dtype: DataType, - op: ReduceOp = ReduceOp.NOP, + dtype: CppDataType, + op: CppReduceOp = CppReduceOp.NOP, stream: int = 0, - executor: Optional[Executor] = None, + executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, extras: Optional[Dict[str, int]] = None, @@ -196,7 +198,7 @@ class Algorithm: class AlgorithmBuilder: - def __init__(self, algorithm_builder: _AlgorithmBuilder): + def __init__(self, algorithm_builder: CppAlgorithmBuilder): self._algorithm_builder = algorithm_builder def build(self) -> Algorithm: @@ -204,7 +206,7 @@ class AlgorithmBuilder: class AlgorithmCollection: - def __init__(self, native_collection: _AlgorithmCollection): + def __init__(self, native_collection: CppAlgorithmCollection): self._native_collection = native_collection self._algorithms = [Algorithm.create_from_native_handle(algo) for algo in self._native_collection.to_list()] diff --git a/python/mscclpp/_core/buffer.py b/python/mscclpp/_core/buffer.py index b54342ea..0575ca68 100644 --- a/python/mscclpp/_core/buffer.py +++ b/python/mscclpp/_core/buffer.py @@ -6,7 +6,7 @@ from typing import Union, Tuple import cupy as cp import numpy as np -from mscclpp._mscclpp import RawGpuBuffer +from mscclpp._mscclpp import CppRawGpuBuffer __all__ = ["GpuBuffer"] @@ -25,6 +25,6 @@ class GpuBuffer(cp.ndarray): if any(s <= 0 for s in shape): raise ValueError("Shape must be positive.") # Create the buffer - buffer = RawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) + buffer = CppRawGpuBuffer(np.prod(shape) * np.dtype(dtype).itemsize) memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer.data(), buffer.bytes(), buffer), 0) return cp.ndarray(shape, dtype=dtype, strides=strides, order=order, memptr=memptr) diff --git a/python/mscclpp/_core/comm.py b/python/mscclpp/_core/comm.py index 2b5a5f25..f0c5c219 100644 --- a/python/mscclpp/_core/comm.py +++ b/python/mscclpp/_core/comm.py @@ -6,18 +6,18 @@ from typing import Type import cupy as 
cp from mscclpp._mscclpp import ( - Communicator, - Connection, + CppCommunicator, + CppConnection, connect_nvls_collective, - EndpointConfig, - Semaphore, - ProxyService, - RegisteredMemory, - PortChannel, - MemoryChannel, - TcpBootstrap, - Transport, - TransportFlags, + CppEndpointConfig, + CppSemaphore, + CppProxyService, + CppRegisteredMemory, + CppPortChannel, + CppMemoryChannel, + CppTcpBootstrap, + CppTransport, + CppTransportFlags, ) import mpi4py import numpy as np @@ -32,7 +32,7 @@ class CommGroup: self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None ): if interfaceIpPortTrio == "": - self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) + self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) uniq_id = None if mpi_comm.rank == 0: # similar to NCCL's unique id @@ -41,15 +41,15 @@ class CommGroup: self.bootstrap.initialize(uniq_id_global) elif mpi_comm: # use this instead - self.bootstrap = TcpBootstrap.create(mpi_comm.rank, mpi_comm.size) + self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) self.bootstrap.initialize(interfaceIpPortTrio) elif not interfaceIpPortTrio == "": assert rank >= 0 and size >= 1 - self.bootstrap = TcpBootstrap.create(rank, size) + self.bootstrap = CppTcpBootstrap.create(rank, size) self.bootstrap.initialize(interfaceIpPortTrio) else: raise RuntimeError("Either the interface or mpi_group need to be specified") - self.communicator = Communicator(self.bootstrap) + self.communicator = CppCommunicator(self.bootstrap) self.my_rank = self.bootstrap.get_rank() self.nranks = self.bootstrap.get_n_ranks() self.nranks_per_node = self.bootstrap.get_n_ranks_per_node() @@ -63,43 +63,43 @@ class CommGroup: def recv(self, tensor: np.ndarray, peer: int, tag: int): self.bootstrap.recv(tensor.ctypes.data, tensor.size * tensor.itemsize, peer, tag) - def my_ib_device(self, local_rank: int) -> Transport: + def my_ib_device(self, local_rank: int) -> CppTransport: if local_rank == 0: - return Transport.IB0 + return CppTransport.IB0 if local_rank == 1: - return Transport.IB1 + return CppTransport.IB1 if local_rank == 2: - return Transport.IB2 + return CppTransport.IB2 if local_rank == 3: - return Transport.IB3 + return CppTransport.IB3 if local_rank == 4: - return Transport.IB4 + return CppTransport.IB4 if local_rank == 5: - return Transport.IB5 + return CppTransport.IB5 if local_rank == 6: - return Transport.IB6 + return CppTransport.IB6 if local_rank == 7: - return Transport.IB7 + return CppTransport.IB7 else: assert False # only 8 IBs are supported def make_connection( self, all_ranks: list[int], - endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport], + endpoints: CppEndpointConfig | CppTransport | dict[int, CppEndpointConfig] | dict[int, CppTransport], use_switch: bool = False, - ) -> dict[int, Connection]: - if type(endpoints) is Transport: - endpoints = EndpointConfig(endpoints) + ) -> dict[int, CppConnection]: + if type(endpoints) is CppTransport: + endpoints = CppEndpointConfig(endpoints) elif type(endpoints) is dict: - endpoints = {k: EndpointConfig(v) if type(v) is Transport else v for k, v in endpoints.items()} + endpoints = {k: CppEndpointConfig(v) if type(v) is CppTransport else v for k, v in endpoints.items()} connections = {} for rank in all_ranks: if type(endpoints) is dict: endpoint = endpoints[rank] else: endpoint = endpoints - if endpoint.transport == Transport.CudaIpc and use_switch: + if endpoint.transport == 
CppTransport.CudaIpc and use_switch: return connect_nvls_collective(self.communicator, all_ranks, 2**30) else: connections[rank] = self.communicator.connect(endpoint, rank) @@ -107,8 +107,8 @@ class CommGroup: return connections def register_tensor_with_connections( - self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, tensor: Type[cp.ndarray] | Type[np.ndarray], connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: local_reg_memory = self.register_local_memory(tensor, connections) all_registered_memories = {} all_registered_memories[self.my_rank] = local_reg_memory @@ -121,8 +121,8 @@ class CommGroup: return all_registered_memories def _register_memory_with_connections( - self, memory: RegisteredMemory, connections: dict[int, Connection] - ) -> dict[int, RegisteredMemory]: + self, memory: CppRegisteredMemory, connections: dict[int, CppConnection] + ) -> dict[int, CppRegisteredMemory]: all_registered_memories = {} all_registered_memories[self.my_rank] = memory future_memories = {} @@ -133,18 +133,20 @@ class CommGroup: all_registered_memories[rank] = future_memories[rank].get() return all_registered_memories - def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]: + def make_semaphores(self, connections: dict[int, CppConnection]) -> dict[int, CppSemaphore]: future_semaphores = {} for rank in connections: future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank) return {rank: future.get() for rank, future in future_semaphores.items()} - def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]: + def make_memory_channels( + self, tensor: cp.ndarray, connections: dict[int, CppConnection] + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) channels = {} for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], registered_memories[self.my_rank] ) return channels @@ -152,9 +154,9 @@ class CommGroup: def make_memory_channels_with_scratch( self, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, MemoryChannel]: + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppMemoryChannel]: semaphores = self.make_semaphores(connections) registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections) channels = {} @@ -162,17 +164,17 @@ class CommGroup: tensor_size = ( tensor.numel() * tensor.element_size() if is_torch_tensor(tensor) else tensor.size * tensor.itemsize ) - local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, TransportFlags()) + local_registered_memory = self.communicator.register_memory(tensor_data_ptr, tensor_size, CppTransportFlags()) scratch_data_ptr = registeredScratchBuffer.data() for rank in connections: - channels[rank] = MemoryChannel( + channels[rank] = CppMemoryChannel( semaphores[rank], registered_memories[rank], local_registered_memory, scratch_data_ptr ) return channels def make_port_channels( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, tensor: cp.ndarray, 
connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -188,12 +190,12 @@ class CommGroup: def make_port_channels_with_scratch( self, - proxy_service: ProxyService, + proxy_service: CppProxyService, tensor: cp.ndarray, - registeredScratchBuffer: RegisteredMemory, - connections: dict[int, Connection], - ) -> dict[int, PortChannel]: - transport_flags = TransportFlags() + registeredScratchBuffer: CppRegisteredMemory, + connections: dict[int, CppConnection], + ) -> dict[int, CppPortChannel]: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( @@ -223,8 +225,8 @@ class CommGroup: return channels def register_semaphore_with_proxy( - self, proxy_service: ProxyService, connections: dict[int, Connection] - ) -> dict[int, PortChannel]: + self, proxy_service: CppProxyService, connections: dict[int, CppConnection] + ) -> dict[int, CppPortChannel]: semaphores = self.make_semaphores(connections) semaphore_ids = {} for rank in semaphores: @@ -235,7 +237,7 @@ class CommGroup: return channels def register_memory_with_proxy( - self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection] + self, proxy_service: CppProxyService, tensor: cp.ndarray, connections: dict[int, CppConnection] ) -> dict[int, int]: registered_memories = self.register_tensor_with_connections(tensor, connections) memory_ids = {} @@ -243,8 +245,8 @@ class CommGroup: memory_ids[rank] = proxy_service.add_memory(registered_memories[rank]) return memory_ids - def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> RegisteredMemory: - transport_flags = TransportFlags() + def register_local_memory(self, tensor: cp.ndarray, connections: dict[int, CppConnection]) -> CppRegisteredMemory: + transport_flags = CppTransportFlags() for rank in connections: transport_flags |= connections[rank].transport() data_ptr = ( diff --git a/python/mscclpp/_core/compiler.py b/python/mscclpp/_core/compiler.py index 82ae93a9..b2da976d 100644 --- a/python/mscclpp/_core/compiler.py +++ b/python/mscclpp/_core/compiler.py @@ -26,9 +26,7 @@ from mscclpp.language.program import CollectiveProgram from mscclpp.language.utils import AlgoSpec from mscclpp.utils import get_device_arch -from mscclpp._mscclpp import ( - ExecutionPlan, -) +from mscclpp._mscclpp import CppExecutionPlan, env logging.basicConfig(level=logging.INFO) @@ -51,7 +49,7 @@ class DslCompiler: into execution plans that can be run on GPUs. The compiled plans are cached to disk for reuse. - The cache location can be configured via the `MSCCLPP_EXECUTION_PLAN_DIR` + The cache location can be configured via the `MSCCLPP_CACHE_DIR` environment variable (defaults to `~/.cache/mscclpp`). 
Example: @@ -138,7 +136,7 @@ class DslCompiler: ) ).hexdigest() - plan_dir = os.environ.get("MSCCLPP_EXECUTION_PLAN_DIR", Path.home() / ".cache/mscclpp") + plan_dir = Path(env().cache_dir) os.makedirs(plan_dir, exist_ok=True) filename = f"{plan_id}.json" plan_path = os.path.join(plan_dir, filename) @@ -157,7 +155,7 @@ class DslCompiler: os.remove(tmp_path) except Exception: Path(plan_path).unlink(missing_ok=True) - execution_plan = ExecutionPlan(plan_path, rank) + execution_plan = CppExecutionPlan(plan_path, rank) return Algorithm( id=plan_id, execution_plan=execution_plan, @@ -179,8 +177,8 @@ class NativeCodeCompiler: based on the runtime environment. Compiled modules are cached to avoid recompilation. - The cache location can be configured via the `MSCCLPP_NATIVE_CACHE_DIR` - environment variable (defaults to `~/.cache/mscclpp/native`). + The cache location can be configured via the `MSCCLPP_CACHE_DIR` + environment variable (defaults to `~/.cache/mscclpp`). Attributes: _is_hip: True if running on AMD/ROCm, False for NVIDIA/CUDA. @@ -226,8 +224,7 @@ class NativeCodeCompiler: "-L" + os.path.join(self._lib_home, "lib"), "-lmscclpp", ] - cache_root = os.environ.get("MSCCLPP_NATIVE_CACHE_DIR", Path.home() / ".cache/mscclpp/native") - self._cache_dir = Path(cache_root) + self._cache_dir = Path(env().cache_dir) / "native" self._cache_dir.mkdir(parents=True, exist_ok=True) def _get_compiler(self) -> str: @@ -283,7 +280,7 @@ class NativeCodeCompiler: Note: - The source file should include pybind11 bindings to expose functions. - MSCCLPP headers are automatically included in the compilation. - - The module is cached in `MSCCLPP_NATIVE_CACHE_DIR` (default: ~/.cache/mscclpp/native). + - The module is cached in `MSCCLPP_CACHE_DIR` (default: ~/.cache/mscclpp). - File locking is used to prevent race conditions during parallel compilation. Example: diff --git a/python/mscclpp/ext/__init__.py b/python/mscclpp/ext/__init__.py index 5c73df3c..08a96ecd 100644 --- a/python/mscclpp/ext/__init__.py +++ b/python/mscclpp/ext/__init__.py @@ -2,5 +2,3 @@ # Licensed under the MIT license. 
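[Editor's note] One pattern worth noting in the plan-cache hunk above: the compiler stages each plan through a temporary file (the `tmp_path` handling) and unlinks the final path on failure, so a crashed or concurrent compile does not leave a truncated plan behind. A sketch of the same write-then-rename idea in C++ (names are illustrative, not the project's API; assumes POSIX rename semantics within one filesystem):

    #include <filesystem>
    #include <fstream>
    #include <string>
    #include <system_error>

    namespace fs = std::filesystem;

    // Illustrative: publish a cached artifact atomically. rename() within a
    // single filesystem is atomic on POSIX, so readers never see a partial file.
    void publishToCache(const fs::path& finalPath, const std::string& contents) {
      fs::path tmp = finalPath;
      tmp += ".tmp";  // hypothetical staging suffix
      {
        std::ofstream out(tmp, std::ios::binary);
        out << contents;
      }  // stream closed before renaming
      std::error_code ec;
      fs::rename(tmp, finalPath, ec);  // atomic publish
      if (ec) fs::remove(tmp, ec);     // best-effort cleanup on failure
    }
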
from .algorithm_collection_builder import * - -__all__ = algorithm_collection_builder.__all__ diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py index 51a178fb..8361bd2f 100644 --- a/python/mscclpp/ext/algorithm_collection_builder.py +++ b/python/mscclpp/ext/algorithm_collection_builder.py @@ -6,9 +6,7 @@ from typing import Union from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection import atexit -from mscclpp._mscclpp import ( - AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder, -) +from mscclpp._mscclpp import CppAlgorithmCollectionBuilder __all__ = ["AlgorithmCollectionBuilder"] @@ -24,12 +22,12 @@ class AlgorithmCollectionBuilder: @classmethod def reset(cls): if cls._instance is not None: - _AlgorithmCollectionBuilder.reset() + CppAlgorithmCollectionBuilder.reset() cls._instance = None def __init__(self): if not hasattr(self, "_initialized"): - self._builder = _AlgorithmCollectionBuilder.get_instance() + self._builder = CppAlgorithmCollectionBuilder.get_instance() self._initialized = True def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]): diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 783b0ca9..69dd7ce6 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -11,7 +11,7 @@ from typing import Any, Type, Union import cupy as cp import numpy as np -from mscclpp._mscclpp import DataType +from mscclpp._mscclpp import CppDataType as DataType try: import torch diff --git a/src/core/env.cpp b/src/core/env.cpp index 35a31f4c..508208e9 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -58,8 +58,7 @@ Env::Env() socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), commId(readEnv("MSCCLPP_COMM_ID", "")), - executionPlanDir(readEnv("MSCCLPP_EXECUTION_PLAN_DIR", - readEnv("HOME", "~") + "/.cache/mscclpp_default")), + cacheDir(readEnv("MSCCLPP_CACHE_DIR", readEnv("HOME", "~") + "/.cache/mscclpp")), npkitDumpDir(readEnv("MSCCLPP_NPKIT_DUMP_DIR", "")), cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)), ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")), @@ -85,7 +84,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); logEnv("MSCCLPP_COMM_ID", globalEnv->commId); - logEnv("MSCCLPP_EXECUTION_PLAN_DIR", globalEnv->executionPlanDir); + logEnv("MSCCLPP_CACHE_DIR", globalEnv->cacheDir); logEnv("MSCCLPP_NPKIT_DUMP_DIR", globalEnv->npkitDumpDir); logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 566c1852..67e616ae 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -105,13 +105,13 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultDslAlgorithms(int ra return oss.str(); }; - std::string planDir = env()->executionPlanDir; + auto planDir = std::filesystem::path(env()->cacheDir) / "default"; if (!std::filesystem::exists(planDir)) { - INFO(ALGO, "Plan directory does not exist: ", planDir); + INFO(ALGO, "Default plan directory does not exist: ", planDir); return collection; } for (const auto& config : defaultAlgoConfigs) { - std::string planPath = planDir + "/" + 
config.filename; + auto planPath = planDir / config.filename; INFO(ALGO, "Loading plan: ", planPath); if (!std::filesystem::exists(planPath)) { INFO(ALGO, "Plan file does not exist: ", planPath); From dc747b15222b7eab3ab710f1594e90aecafbadde Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Thu, 5 Feb 2026 09:23:43 -0800 Subject: [PATCH 04/52] Refactor reduce kernel (#738) - Put the common reduce kernel to reduce_kernel.hpp - Implement operator overloading for the vector type - Clean up the duplicated code at `executor_ kernel.hpp` and `allreduce/common.hpp` --- include/mscclpp/gpu_data_types.hpp | 456 ++++++++++++++++-- include/mscclpp/switch_channel_device.hpp | 24 +- src/core/include/execution_kernel.hpp | 379 +-------------- src/core/include/reduce_kernel.hpp | 81 ++++ .../allreduce/allreduce_allpair_packet.cu | 2 +- .../allreduce/allreduce_fullmesh.cu | 4 +- .../allreduce/allreduce_nvls_packet.cu | 2 +- .../collectives/allreduce/allreduce_packet.cu | 4 +- .../collectives/include/allreduce/common.hpp | 441 +---------------- 9 files changed, 538 insertions(+), 855 deletions(-) create mode 100644 src/core/include/reduce_kernel.hpp diff --git a/include/mscclpp/gpu_data_types.hpp b/include/mscclpp/gpu_data_types.hpp index 99b95d9a..9e7747a8 100644 --- a/include/mscclpp/gpu_data_types.hpp +++ b/include/mscclpp/gpu_data_types.hpp @@ -16,20 +16,27 @@ using __bfloat16 = __hip_bfloat16; using __bfloat162 = __hip_bfloat162; #define __CUDA_BF16_TYPES_EXIST__ -// AMD FP8 support - hip_fp8.h provides __hip_fp8_e4m3_fnuz and __hip_fp8_e5m2_fnuz -// Only available on gfx942 and newer architectures (ROCm 6.0+) +// AMD FP8 support - Use fnuz types for HIP 6.0 or when HIP_FP8_TYPE_FNUZ is enabled and HIP_FP8_TYPE_OCP is not +// enabled. Otherwise, use the standard FP8 types. #if defined(HIP_VERSION_MAJOR) && (HIP_VERSION_MAJOR >= 6) #include // Create aliases matching CUDA naming convention for cross-platform compatibility +#if (HIP_VERSION_MAJOR == 6) || (HIP_VERSION_MAJOR > 6 && HIP_FP8_TYPE_FNUZ && !HIP_FP8_TYPE_OCP) using __fp8_e4m3 = __hip_fp8_e4m3_fnuz; using __fp8_e5m2 = __hip_fp8_e5m2_fnuz; - -// HIP FP8 vector types use storage types (from hip/amd_detail/amd_hip_fp8.h): -using __fp8x2_e4m3 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x2_e5m2 = __hip_fp8x2_storage_t; // uint16_t -using __fp8x4_e4m3 = __hip_fp8x4_storage_t; // uint32_t -using __fp8x4_e5m2 = __hip_fp8x4_storage_t; // uint32_t +using __fp8x2_e4m3 = __hip_fp8x2_e4m3_fnuz; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2_fnuz; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3_fnuz; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2_fnuz; +#else +using __fp8_e4m3 = __hip_fp8_e4m3; +using __fp8_e5m2 = __hip_fp8_e5m2; +using __fp8x2_e4m3 = __hip_fp8x2_e4m3; +using __fp8x2_e5m2 = __hip_fp8x2_e5m2; +using __fp8x4_e4m3 = __hip_fp8x4_e4m3; +using __fp8x4_e5m2 = __hip_fp8x4_e5m2; +#endif #define __FP8_TYPES_EXIST__ #endif // HIP_VERSION_MAJOR >= 6 @@ -71,10 +78,8 @@ enum class DataType { }; /// Word array. -template +template = 4 && Bytes % 4 == 0)> struct alignas(Bytes) Words { - static_assert(Bytes > 0, "Bytes must be greater than 0"); - static_assert(Bytes % 4 == 0, "Bytes must be multiple of 4"); uint32_t w[Bytes / 4]; MSCCLPP_HOST_DEVICE_INLINE Words() {} @@ -84,18 +89,33 @@ struct alignas(Bytes) Words { MSCCLPP_HOST_DEVICE_INLINE const uint32_t& operator[](int i) const { return w[i]; } }; -/// Vector type. -template -union alignas(sizeof(T) * N) VectorType { +template +struct alignas(Bytes) Words {}; + +/// Vector type implementation (internal). 
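/// The union below overlays three views of the same bytes — an element array
/// (data[]), a Words view for word-wise asm access, and a native storage word
/// — so a whole vector moves with a single hardware-friendly load or store.
/// A brief usage sketch (illustrative only; the alias names come from the
/// DEFINE_VEC list further down, e.g. f16x8 packs eight __half values into
/// one uint4 storage word):
///
///   mscclpp::f16x8 v;        // element view: v.data[0] .. v.data[7]
///   v[3] = __half(1.0f);     // per-element access via operator[]
///   uint4 raw = v;           // implicit conversion to the storage word
///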
+template +union alignas(sizeof(T) * N) VectorTypeImpl { static_assert(N > 0, "N must be greater than 0"); T data[N]; Words words; + StorageT storage; using ElementType = T; constexpr static int Size = N; - MSCCLPP_HOST_DEVICE_INLINE VectorType() {} + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl() {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const StorageT& value) : storage(value) {} + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl(const VectorTypeImpl& other) { storage = other.storage; } + + MSCCLPP_HOST_DEVICE_INLINE VectorTypeImpl& operator=(const VectorTypeImpl& other) { + storage = other.storage; + return *this; + } + + MSCCLPP_HOST_DEVICE_INLINE operator StorageT() const { return storage; } MSCCLPP_HOST_DEVICE_INLINE operator T*() { return data; } @@ -106,38 +126,394 @@ union alignas(sizeof(T) * N) VectorType { MSCCLPP_HOST_DEVICE_INLINE const T& operator[](int i) const { return data[i]; } }; -using i32x1 = VectorType; -using u32x1 = VectorType; -using f64x1 = VectorType; -using f32x1 = VectorType; +// Helper template to get the appropriate vector type for a given element type and count +template +struct VectorTypeHelper { + using type = + VectorTypeImpl>>; +}; -using i32x2 = VectorType; -using u32x2 = VectorType; -using f32x2 = VectorType; -using f16x2 = VectorType<__half, 2>; -using bf16x2 = VectorType<__bfloat16, 2>; +/// Vector type - clean user interface (automatically selects appropriate storage type) +template +using VectorType = typename VectorTypeHelper::type; -using i32x4 = VectorType; -using u32x4 = VectorType; -using f32x4 = VectorType; -using f16x4 = VectorType<__half, 4>; -using bf16x4 = VectorType<__bfloat16, 4>; +// Macro to define specialization AND alias in one go +#define DEFINE_VEC(Alias, T, N, Storage) \ + template <> \ + struct VectorTypeHelper { \ + using type = VectorTypeImpl; \ + }; \ + using Alias = VectorType -using f16x8 = VectorType<__half, 8>; -using bf16x8 = VectorType<__bfloat16, 8>; +DEFINE_VEC(i32x1, int32_t, 1, int32_t); +DEFINE_VEC(u32x1, uint32_t, 1, uint32_t); +DEFINE_VEC(f32x1, float, 1, float); +DEFINE_VEC(f64x1, double, 1, double); + +DEFINE_VEC(i32x2, int32_t, 2, int2); +DEFINE_VEC(u32x2, uint32_t, 2, uint2); +DEFINE_VEC(f32x2, float, 2, float2); +DEFINE_VEC(f16x2, __half, 2, __half2); +DEFINE_VEC(bf16x2, __bfloat16, 2, __bfloat162); + +DEFINE_VEC(i32x4, int32_t, 4, int4); +DEFINE_VEC(u32x4, uint32_t, 4, uint4); +DEFINE_VEC(f32x4, float, 4, float4); +DEFINE_VEC(f16x4, __half, 4, uint2); +DEFINE_VEC(bf16x4, __bfloat16, 4, uint2); + +DEFINE_VEC(f16x8, __half, 8, uint4); +DEFINE_VEC(bf16x8, __bfloat16, 8, uint4); #if defined(__FP8_TYPES_EXIST__) -// FP8 vector types -using fp8_e4m3x2 = VectorType<__fp8_e4m3, 2>; -using fp8_e4m3x4 = VectorType<__fp8_e4m3, 4>; -using fp8_e4m3x8 = VectorType<__fp8_e4m3, 8>; -using fp8_e4m3x16 = VectorType<__fp8_e4m3, 16>; -using fp8_e5m2x2 = VectorType<__fp8_e5m2, 2>; -using fp8_e5m2x4 = VectorType<__fp8_e5m2, 4>; -using fp8_e5m2x8 = VectorType<__fp8_e5m2, 8>; -using fp8_e5m2x16 = VectorType<__fp8_e5m2, 16>; +DEFINE_VEC(f8_e4m3x2, __fp8_e4m3, 2, __fp8x2_e4m3); +DEFINE_VEC(f8_e4m3x4, __fp8_e4m3, 4, __fp8x4_e4m3); +DEFINE_VEC(f8_e4m3x8, __fp8_e4m3, 8, uint2); +DEFINE_VEC(f8_e4m3x16, __fp8_e4m3, 16, uint4); + +DEFINE_VEC(f8_e5m2x2, __fp8_e5m2, 2, __fp8x2_e5m2); +DEFINE_VEC(f8_e5m2x4, __fp8_e5m2, 4, __fp8x4_e5m2); +DEFINE_VEC(f8_e5m2x8, __fp8_e5m2, 8, uint2); +DEFINE_VEC(f8_e5m2x16, __fp8_e5m2, 16, uint4); +#endif +#undef DEFINE_VEC + +#if defined(MSCCLPP_DEVICE_COMPILE) +template +MSCCLPP_DEVICE_INLINE To bit_cast(const 
From& src) { + static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); + + union { + From f; + To t; + } u{.f = src}; + return u.t; +} + +template +MSCCLPP_DEVICE_INLINE T clip(T val) { + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half clip(__half val) { + val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); + val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); + + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __half2 clip(__half2 val) { + val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); + val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); + val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); + val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat16 clip(__bfloat16 val) { + val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); + val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +template <> +MSCCLPP_DEVICE_INLINE __bfloat162 clip(__bfloat162 val) { + val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); + val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); + val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); + return val; +} + +// FP8 E4M3 clipping function +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 clip(__fp8_e4m3 val) { + // FP8 E4M3 has range [-448, 448], no infinities + // Built-in saturation in FP8 arithmetic + return val; +} + +// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 clip(__fp8_e5m2 val) { + // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow + // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) + float fval = float(val); + fval = fmaxf(fval, -57344.0f); + fval = fminf(fval, 57344.0f); + return __fp8_e5m2(fval); +} #endif +template +MSCCLPP_DEVICE_INLINE f16x2 operator+(const f16x2& a, const f16x2& b) { + __half2 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +template +MSCCLPP_DEVICE_INLINE bf16x2 operator+(const bf16x2& a, const bf16x2& b) { + __bfloat162 result; + if constexpr (UseClip) { + result = clip(__hadd2(a, b)); + } else { + result = __hadd2(a, b); + } + return result; +} + +#if defined(__FP8_TYPES_EXIST__) +template +MSCCLPP_DEVICE_INLINE __fp8_e4m3 operator+(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition (CUDA 11.8+) + __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + // Fallback for other devices + __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x2 operator+(const f8_e4m3x2& a, const f8_e4m3x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false))); +#elif defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + return __fp8x2_e4m3(__hadd2(__half2(static_cast<__fp8x2_e4m3>(a)), __half2(static_cast<__fp8x2_e4m3>(b)))); +#else + // Fallback for other devices: element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e4m3x4 operator+(const f8_e4m3x4& a, const f8_e4m3x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E4M3 using fp8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e4m3x2 using operator+ for 2 elements + const f8_e4m3x2* a_pair = reinterpret_cast(&a); + const f8_e4m3x2* b_pair = reinterpret_cast(&b); + + f8_e4m3x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} + +template +MSCCLPP_DEVICE_INLINE __fp8_e5m2 operator+(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // Optimized assembly for gfx942 (bfloat8) + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); + return static_cast<__hip_fp8_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false)); +#elif defined(MSCCLPP_DEVICE_CUDA) + // NVIDIA CUDA FP8 addition + __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); + return UseClip ? clip(result) : result; +#else + __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); + return UseClip ? 
clip(result) : result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x2 operator+(const f8_e5m2x2& a, const f8_e5m2x2& b) { +#if defined(MSCCLPP_DEVICE_CUDA) + // CUDA: Convert to half2, add using optimized __hadd2, convert back + f8_e5m2x2 result = + __fp8x2_e5m2(__hadd2(__half2(static_cast<__fp8x2_e5m2>(a)), __half2(static_cast<__fp8x2_e5m2>(b)))); + if constexpr (UseClip) { + result = clip(result); + } + return result; +#elif defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + // HIP gfx942: Use BF8 assembly instructions + float2 v; + uint32_t ival = 0; + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.data[0].__x, 0)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.data[0].__x, 0))); + return bit_cast( + static_cast<__hip_fp8x2_storage_t>(__builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false))); +#else + // Fallback: element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = a.data[0] + b.data[0]; + result.data[1] = a.data[1] + b.data[1]; + return result; +#endif +} + +template +MSCCLPP_DEVICE_INLINE f8_e5m2x4 operator+(const f8_e5m2x4& a, const f8_e5m2x4& b) { +#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) + float2 v_low, v_high; + // E5M2 using bf8 conversion - process low word (false) and high word (true) + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_low) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, false)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, false))); + uint32_t result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, 0, false); + + asm volatile("v_pk_add_f32 %0, %1, %2" + : "=v"(v_high) + : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.storage.__x, true)), + "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.storage.__x, true))); + result_packed = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, result_packed, true); + return bit_cast(result_packed); +#else + // Process as two f8_e5m2x2 using operator+ for 2 elements + const f8_e5m2x2* a_pair = reinterpret_cast(&a); + const f8_e5m2x2* b_pair = reinterpret_cast(&b); + f8_e5m2x2 result[2]; + result[0] = a_pair[0] + b_pair[0]; + result[1] = a_pair[1] + b_pair[1]; + + return *reinterpret_cast(result); +#endif +} +#endif // defined(__FP8_TYPES_EXIST__) + +template +MSCCLPP_DEVICE_INLINE T min(const T& a, const T& b) { + return (a < b ? 
a : b); +} + +template <> +MSCCLPP_DEVICE_INLINE f16x2 min(const f16x2& a, const f16x2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + f16x2 val; + val[0] = __hmin(a[0], b[0]); + val[1] = __hmin(a[1], b[1]); + return val; +#else + __half2 ret = __hmin2(a, b); + return ret; +#endif +} + +template <> +MSCCLPP_DEVICE_INLINE bf16x2 min(const bf16x2& a, const bf16x2& b) { + return __hmin2(a, b); +} + +#if defined(__FP8_TYPES_EXIST__) +template <> +MSCCLPP_DEVICE_INLINE __fp8_e4m3 min(const __fp8_e4m3& a, const __fp8_e4m3& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e4m3(fminf(float(a), float(b))); +#else + return __fp8_e4m3(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x2 min(const f8_e4m3x2& a, const f8_e4m3x2& b) { + // Process element-wise using single-element operations + f8_e4m3x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e4m3x4 min(const f8_e4m3x4& a, const f8_e4m3x4& b) { + // Process as two f8_e4m3x2 using min for 2 elements + const f8_e4m3x2* a_ptr = reinterpret_cast(&a); + const f8_e4m3x2* b_ptr = reinterpret_cast(&b); + + f8_e4m3x4 result; + f8_e4m3x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} + +template <> +MSCCLPP_DEVICE_INLINE __fp8_e5m2 min(const __fp8_e5m2& a, const __fp8_e5m2& b) { +#if defined(MSCCLPP_DEVICE_HIP) + return __fp8_e5m2(fminf(float(a), float(b))); +#else + return __fp8_e5m2(__hmin(__half(a), __half(b))); +#endif +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x2 min(const f8_e5m2x2& a, const f8_e5m2x2& b) { + // Process element-wise using single-element operations + f8_e5m2x2 result; + result.data[0] = mscclpp::min(a.data[0], b.data[0]); + result.data[1] = mscclpp::min(a.data[1], b.data[1]); + return result; +} + +MSCCLPP_DEVICE_INLINE f8_e5m2x4 min(const f8_e5m2x4& a, const f8_e5m2x4& b) { + // Process as two f8_e5m2x2 using min for 2 elements + const f8_e5m2x2* a_ptr = reinterpret_cast(&a); + const f8_e5m2x2* b_ptr = reinterpret_cast(&b); + + f8_e5m2x4 result; + f8_e5m2x2* result_ptr = reinterpret_cast(&result); + + result_ptr[0] = mscclpp::min(a_ptr[0], b_ptr[0]); + result_ptr[1] = mscclpp::min(a_ptr[1], b_ptr[1]); + + return result; +} +#endif // defined(__FP8_TYPES_EXIST__) +#endif // MSCCLPP_DEVICE_COMPILE } // namespace mscclpp #endif // MSCCLPP_GPU_DATA_TYPES_HPP_ diff --git a/include/mscclpp/switch_channel_device.hpp b/include/mscclpp/switch_channel_device.hpp index 5f8a1608..b52b6572 100644 --- a/include/mscclpp/switch_channel_device.hpp +++ b/include/mscclpp/switch_channel_device.hpp @@ -80,26 +80,26 @@ struct SwitchChannelDeviceHandle { : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e4m3x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e4m3x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e4m3x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : 
"l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.e5m2x4 %0, [%1];" : "=r"(val.words[0]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v2.e5m2x4 {%0,%1}, [%2];" : "=r"(val.words[0]), "=r"(val.words[1]) : "l"(ptr) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm("multimem.ld_reduce.relaxed.sys.global.add.v4.e5m2x4 {%0,%1,%2,%3}, [%4];" : "=r"(val.words[0]), "=r"(val.words[1]), "=r"(val.words[2]), "=r"(val.words[3]) : "l"(ptr) @@ -148,23 +148,23 @@ struct SwitchChannelDeviceHandle { asm volatile("multimem.st.relaxed.sys.global.v4.bf16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e4m3x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e4m3x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e4m3x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.e5m2x4 [%0], %1;" ::"l"(ptr), "r"(val.words[0]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v2.e5m2x4 [%0], {%1,%2};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]) : "memory"); - } else if constexpr (std::is_same_v) { + } else if constexpr (std::is_same_v) { asm volatile("multimem.st.relaxed.sys.global.v4.e5m2x4 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.words[0]), "r"(val.words[1]), "r"(val.words[2]), "r"(val.words[3]) : "memory"); diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index fb6c436f..918bff61 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -17,356 +17,7 @@ #include #include "execution_common.hpp" - -namespace { -#if defined(MSCCLPP_DEVICE_COMPILE) -template -MSCCLPP_DEVICE_INLINE To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -MSCCLPP_DEVICE_INLINE T add_elements(T a, T b) { - return a + b; -} - -template <> -MSCCLPP_DEVICE_INLINE __half2 add_elements(__half2 a, __half2 b) { - return __hadd2(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat16 add_elements(__bfloat16 a, __bfloat16 b) { - return __hadd(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - return __hadd2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 addition using __hadd (single element) -template <> -MSCCLPP_DEVICE_INLINE __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : 
"=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#else - return __fp8_e4m3(__hadd(__half(a), __half(b))); -#endif -} - -// FP8 E5M2 addition using __hadd (single element) - must come before helper functions -template <> -MSCCLPP_DEVICE_INLINE __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#else - return __fp8_e5m2(__hadd(__half(a), __half(b))); -#endif -} - -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) -// HIP gfx942 platform: Helper functions for vectorized FP8 operations -// We use separate function names because __fp8x2_e4m3 and __fp8x2_e5m2 are both uint16_t - -// E4M3 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e4m3(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -} - -// E4M3 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e4m3(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e4m3(a_low, b_low); - uint16_t result_high = add_fp8x2_e4m3(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} - -// E5M2 vectorized addition for 2 elements -MSCCLPP_DEVICE_INLINE uint16_t add_fp8x2_e5m2(uint16_t a, uint16_t b) { - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, ival, false); -} - -// E5M2 vectorized addition for 4 elements -MSCCLPP_DEVICE_INLINE uint32_t add_fp8x4_e5m2(uint32_t a, uint32_t b) { - uint16_t a_low = a & 0xFFFF; - uint16_t a_high = (a >> 16) & 0xFFFF; - uint16_t b_low = b & 0xFFFF; - uint16_t b_high = (b >> 16) & 0xFFFF; - uint16_t result_low = add_fp8x2_e5m2(a_low, b_low); - uint16_t result_high = add_fp8x2_e5m2(a_high, b_high); - return (static_cast(result_high) << 16) | result_low; -} -#endif - -#if !defined(MSCCLPP_DEVICE_HIP) -// CUDA platform: Template specializations for vectorized FP8 operations - -// FP8 E4M3 vectorized addition using __hadd2 for 2 elements (CUDA only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { - return __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E4M3 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e4m3) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 vectorized addition for 2 elements (CUDA 
only) -template <> -MSCCLPP_DEVICE_INLINE __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template <> -MSCCLPP_DEVICE_INLINE __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(add_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(add_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE int4 add_vectors(int4 a, int4 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__half>(int4 a, int4 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__bfloat16>(int4 a, int4 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e4m3>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e4m3(a.w, b.w); - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - ret.z = add_fp8x4_e4m3(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE int4 add_vectors<__fp8_e5m2>(int4 a, int4 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - int4 ret; - ret.w = add_fp8x4_e5m2(a.w, b.w); - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - ret.z = add_fp8x4_e5m2(a.z, b.z); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(add_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(add_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -MSCCLPP_DEVICE_INLINE uint2 add_vectors(uint2 a, uint2 b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__half>(uint2 a, uint2 b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__bfloat16>(uint2 a, uint2 b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 add_vectors<__fp8_e4m3>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e4m3(a.x, b.x); - ret.y = add_fp8x4_e4m3(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) uint2 
add_vectors<__fp8_e5m2>(uint2 a, uint2 b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - // HIP gfx942: Use helper functions that work with storage types - uint2 ret; - ret.x = add_fp8x4_e5m2(a.x, b.x); - ret.y = add_fp8x4_e5m2(a.y, b.y); - return ret; -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE int add_vectors_helper(int a, int b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE int add_vectors(int a, int b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__half>(int a, int b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__bfloat16>(int a, int b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e4m3>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE __attribute__((unused)) int add_vectors<__fp8_e5m2>(int a, int b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors_helper(uint32_t a, uint32_t b) { - return bit_cast(add_elements(bit_cast(a), bit_cast(b))); -} - -template -MSCCLPP_DEVICE_INLINE uint32_t add_vectors(uint32_t a, uint32_t b) { - return add_vectors_helper(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__half>(uint32_t a, uint32_t b) { - return add_vectors_helper<__half2>(a, b); -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__bfloat16>(uint32_t a, uint32_t b) { - return add_vectors_helper<__bfloat162>(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e4m3>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e4m3(a, b); -#else - return add_vectors_helper<__fp8x4_e4m3>(a, b); -#endif -} - -template <> -MSCCLPP_DEVICE_INLINE uint32_t add_vectors<__fp8_e5m2>(uint32_t a, uint32_t b) { -#if defined(MSCCLPP_DEVICE_HIP) && defined(__gfx942__) - return add_fp8x4_e5m2(a, b); -#else - return add_vectors_helper<__fp8x4_e5m2>(a, b); -#endif -} -#endif // __FP8_TYPES_EXIST__ - -#endif // MSCCLPP_DEVICE_COMPILE - -} // namespace - +#include "reduce_kernel.hpp" namespace mscclpp { #if defined(MSCCLPP_DEVICE_COMPILE) @@ -534,7 +185,7 @@ MSCCLPP_DEVICE_INLINE void handlePut(const Operation& op, void* input, void* out } } -template +template MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -559,7 +210,7 @@ MSCCLPP_DEVICE_INLINE void handleReadReduceSend(const Operation& op, void* input sizeof(int4); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); val = mscclpp::read(remoteMemory, srcOffset + idx); - tmp = add_vectors(tmp, val); + tmp = cal_vector(tmp, val); } output4[outputOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -587,7 +238,7 @@ MSCCLPP_DEVICE_INLINE void 
handleReadReduceSend(const Operation& op, void* input getOffset(memoryChannelBufferTypes_[op.inputBufferRefs[index + 1].id], offset)) / sizeof(T); void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.inputBufferRefs[index + 1].id]); - tmp = add_elements(tmp, mscclpp::read(remoteMemory, srcOffset + idx)); + tmp = tmp + mscclpp::read(remoteMemory, srcOffset + idx); } static_cast(output)[idx] = tmp; if constexpr (SendToRemote) { @@ -681,7 +332,7 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -704,9 +355,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = cal_vector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = cal_vector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; if constexpr (SendToRemote) { @@ -720,7 +371,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPackets(const Operation& op, void* in } } -template +template MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void* input, void* output, void* scratch) { uint32_t size = op.inputBufferSizes[0]; const uint32_t nSrcs = op.nInputs - 1; @@ -745,9 +396,9 @@ MSCCLPP_DEVICE_INLINE void handleReduceCopySendPackets(const Operation& op, void for (uint32_t index = 0; index < nSrcs; ++index) { PacketType* pkt = (PacketType*)((char*)scratch + scratchOffset_ + 2 * inputOffsets[index]); PacketPayload val = pkt[idx].read(flag_); - data = add_vectors(data, val); + data = cal_vector(data, val); } - data = add_vectors(data, srcPacketPayload[idx]); + data = cal_vector(data, srcPacketPayload[idx]); dstPacketPayload[idx] = data; PacketType* dst_val = &dstPkt[idx]; dst_val->write(data, flag_); @@ -790,7 +441,7 @@ MSCCLPP_DEVICE_INLINE void handleCopyPackets(const Operation& op, void* input, v mscclpp::copyToPackets(dst, src, size, threadIdx.x, blockDim.x, flag_); } -template +template MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, void* output, void* scratch, uint32_t offset, uint32_t unitSize) { const uint32_t size = min(op.inputBufferSizes[0] - offset, unitSize); @@ -815,7 +466,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo size_t buffOffset = (inputOffsets[index] + getOffset(outputBufferRefs[index].type, offset)) / sizeof(int4); int4 val = buff4[buffOffset + idx]; - tmp = add_vectors(tmp, val); + tmp = cal_vector(tmp, val); } dst4[dstOffset4 + idx] = tmp; if constexpr (SendToRemote) { @@ -840,7 +491,7 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(const Operation& op, void* input, vo T* buff = static_cast(getBuffer(input, output, scratch, inputBufferRefs[index].type)); uint32_t buffOffset = (inputOffsets[index] + getOffset(inputBufferRefs[index].type, offset)) / sizeof(T); - tmp = add_elements(tmp, buff[buffOffset + idx]); + tmp = tmp + buff[buffOffset + idx]; } dst[idx] = tmp; if constexpr (SendToRemote) { @@ -897,7 +548,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 } } else { // handle data in 16-byte unit - using Type16 = typename mscclpp::VectorType; + using 
Type16 = mscclpp::VectorType; const size_t nType16 = size / sizeof(Type16); const size_t srcOffset16 = srcOffset / sizeof(Type16); const size_t dstOffset16 = dstOffset / sizeof(Type16); @@ -909,7 +560,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(const Operation& op, uint3 } // handle rest of data constexpr int RedBytes = (sizeof(T) == 8) ? 8 : 4; - using TypeRest = typename mscclpp::VectorType; + using TypeRest = mscclpp::VectorType; const size_t processed = nType16 * sizeof(Type16); const size_t nRest = (size - processed) / sizeof(TypeRest); TypeRest* srcR = reinterpret_cast(src + srcOffset + processed); diff --git a/src/core/include/reduce_kernel.hpp b/src/core/include/reduce_kernel.hpp new file mode 100644 index 00000000..00dc7714 --- /dev/null +++ b/src/core/include/reduce_kernel.hpp @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +#ifndef MSCCLPP_REDUCE_KERNEL_HPP_ +#define MSCCLPP_REDUCE_KERNEL_HPP_ + +#include +#include +#include + +namespace mscclpp { + +#if defined(MSCCLPP_DEVICE_COMPILE) + +// Generic element-wise calculation helper +template +MSCCLPP_DEVICE_INLINE T cal_elements(const T& a, const T& b) { + if constexpr (OpType == SUM) { + return a + b; + } else if constexpr (OpType == MIN) { + return mscclpp::min(a, b); + } + static_assert(OpType == SUM || OpType == MIN, "Unsupported ReduceOp"); +} + +// Generic vector reduction helpers +template +MSCCLPP_DEVICE_INLINE int4 cal_vector_helper(const int4& a, const int4& b) { + int4 ret; + ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); + ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE uint2 cal_vector_helper(const uint2& a, const uint2& b) { + uint2 ret; + ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); + ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); + return ret; +} + +template +MSCCLPP_DEVICE_INLINE int cal_vector_helper(const int& a, const int& b) { + return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +} + +template +MSCCLPP_DEVICE_INLINE uint32_t cal_vector_helper(const uint32_t& a, const uint32_t& b) { + return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); +} + +// cal_vector wrapper - converts scalar types to vector types and calls cal_vector_helper +template +MSCCLPP_DEVICE_INLINE DataType cal_vector(const DataType& a, const DataType& b) { + // Define the vectorized computation type based on the element type + static_assert(sizeof(DataType) % sizeof(T) == 0, "DataType size must be multiple of T size"); + static_assert(sizeof(DataType) >= 4, "DataType size must be at least 4 bytes"); + using CompType = typename std::conditional_t< + std::is_same_v, f16x2, + std::conditional_t, bf16x2, +#if defined(__FP8_TYPES_EXIST__) + std::conditional_t, f8_e4m3x4, + std::conditional_t, f8_e5m2x4, +#endif + T +#if defined(__FP8_TYPES_EXIST__) + >>>>; +#else + >>; +#endif + return cal_vector_helper(a, b); +} + +#endif // defined(MSCCLPP_DEVICE_COMPILE) + +} // namespace mscclpp + +#endif // MSCCLPP_REDUCE_KERNEL_HPP_ diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index a4881093..f6081043 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -58,7 +58,7 @@ __global__ void 
allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand const int remoteRank = index < rank ? index : index + 1; LL8Packet* dstPkt = (LL8Packet*)scratchBuff + remoteRank * nelems; uint32_t val = dstPkt[idx].read(flag, -1); - data = cal_vectors(val, data); + data = cal_vector(val, data); } dst[idx] = data; } diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index e8cd93bb..d04766c1 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -85,7 +85,7 @@ __global__ void __launch_bounds__(512, 1) for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + data = cal_vector(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { @@ -125,7 +125,7 @@ __global__ void __launch_bounds__(512, 1) for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1; int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx]; - data = cal_vectors(val, data); + data = cal_vector(val, data); } resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data; for (int peerIdx = 0; peerIdx < nPeer; peerIdx++) { diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index aafe7566..bc7d596a 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -47,7 +47,7 @@ __global__ void __launch_bounds__(1024, 1) continue; } uint val = scratchPkt[peer * worldSize * nPktPerRank + i].read(flag); - data = cal_vectors(data, val); + data = cal_vector(data, val); } dst[i] = data; } diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index d150c717..23ed5d09 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -102,8 +102,8 @@ __global__ void __launch_bounds__(1024, 1) const int remoteRank = index < rank ? 
index : index + 1; mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank; uint2 val = dstPkt[idx].read(flag); - data.x = cal_vectors(val.x, data.x); - data.y = cal_vectors(val.y, data.y); + data.x = cal_vector(val.x, data.x); + data.y = cal_vector(val.y, data.y); } dst[idx].x = data.x; diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 10eecf7e..26b57dbf 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -10,6 +10,8 @@ #include #include +#include "reduce_kernel.hpp" + #if defined(ENABLE_NPKIT) #include #endif @@ -22,438 +24,6 @@ constexpr ReduceOp MIN = ReduceOp::MIN; #if defined(MSCCLPP_DEVICE_COMPILE) -template -__forceinline__ __device__ To bit_cast(const From& src) { - static_assert(sizeof(To) == sizeof(From), "Size mismatch for bit_cast"); - - union { - From f; - To t; - } u; - u.f = src; - return u.t; -} - -template -__forceinline__ __device__ T clip(T val) { - return val; -} - -template <> -__forceinline__ __device__ __half clip(__half val) { - val = __hmax(val, bit_cast<__half, unsigned short>(0xfbff)); - val = __hmin(val, bit_cast<__half, unsigned short>(0x7bff)); - - return val; -} - -template <> -__forceinline__ __device__ __half2 clip(__half2 val) { - val.x = __hmax(val.x, bit_cast<__half, unsigned short>(0xfbff)); - val.x = __hmin(val.x, bit_cast<__half, unsigned short>(0x7bff)); - val.y = __hmax(val.y, bit_cast<__half, unsigned short>(0xfbff)); - val.y = __hmin(val.y, bit_cast<__half, unsigned short>(0x7bff)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat16 clip(__bfloat16 val) { - val = __hmax(val, bit_cast<__bfloat16, unsigned short>(0xff80)); - val = __hmin(val, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template <> -__forceinline__ __device__ __bfloat162 clip(__bfloat162 val) { - val.x = __hmax(val.x, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.x = __hmin(val.x, bit_cast<__bfloat16, unsigned short>(0x7f80)); - val.y = __hmax(val.y, bit_cast<__bfloat16, unsigned short>(0xff80)); - val.y = __hmin(val.y, bit_cast<__bfloat16, unsigned short>(0x7f80)); - return val; -} - -template -__forceinline__ __device__ T add_elements(T a, T b) { - if constexpr (UseClip) { - return clip(a + b); - } else { - return a + b; - } -} - -template -__forceinline__ __device__ __half2 add_elements(__half2 a, __half2 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ __bfloat162 add_elements(__bfloat162 a, __bfloat162 b) { - if constexpr (UseClip) { - return clip(__hadd2(a, b)); - } else { - return __hadd2(a, b); - } -} - -template -__forceinline__ __device__ T min_elements(T a, T b) { - return (a < b ? 
a : b); -} - -template <> -__forceinline__ __device__ __half2 min_elements(__half2 a, __half2 b) { -#if defined(__HIP_PLATFORM_AMD__) - __half2 val; - val.x = __hmin(a.x, b.x); - val.y = __hmin(a.y, b.y); - return val; -#else - return __hmin2(a, b); -#endif -} - -template <> -__forceinline__ __device__ __bfloat162 min_elements(__bfloat162 a, __bfloat162 b) { - return __hmin2(a, b); -} - -#if defined(__FP8_TYPES_EXIST__) -// FP8 E4M3 clipping function -template <> -__forceinline__ __device__ __fp8_e4m3 clip(__fp8_e4m3 val) { - // FP8 E4M3 has range [-448, 448], no infinities - // Built-in saturation in FP8 arithmetic - return val; -} - -// FP8 E5M2 clipping function - prevent infinities by clamping to max finite value -template <> -__forceinline__ __device__ __fp8_e5m2 clip(__fp8_e5m2 val) { - // FP8 E5M2 has infinities - clamp to max finite value to prevent overflow - // Max finite value for E5M2 is 57344.0f (0x7B), min is -57344.0f (0xFB) - float fval = float(val); - fval = fmaxf(fval, -57344.0f); - fval = fminf(fval, 57344.0f); - return __fp8_e5m2(fval); -} - -// FP8 E4M3 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e4m3 add_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition (CUDA 11.8+) - __fp8_e4m3 result = __fp8_e4m3(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e4m3 result = __fp8_e4m3(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -// FP8 E4M3 vectorized addition for 2 elements -template -__forceinline__ __device__ __fp8x2_e4m3 add_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b, 0))); - return __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e4m3 result = __fp8x2_e4m3(__hadd2(__half2(a), __half2(b))); - return result; -#else - // Fallback for non-gfx942 HIP: element-wise using single-element operations - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = add_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = add_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#endif -} - -// FP8 E4M3 vectorized addition for 4 elements (via 2x __fp8x2_e4m3) -template -__forceinline__ __device__ __fp8x4_e4m3 add_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using add_elements for 2 elements - __fp8x2_e4m3* a_pair = reinterpret_cast<__fp8x2_e4m3*>(&a); - __fp8x2_e4m3* b_pair = reinterpret_cast<__fp8x2_e4m3*>(&b); - - __fp8x2_e4m3 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e4m3*>(result); -} - -// FP8 E5M2 addition using __hadd for efficiency (single element) -template -__forceinline__ __device__ __fp8_e5m2 add_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__gfx942__) - // Optimized assembly for gfx942 (bfloat8) - float2 v; - uint32_t ival = 0; - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a.__x, 0)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b.__x, 0))); - return __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.x, ival, false); -#elif !defined(__HIP_PLATFORM_AMD__) - // NVIDIA CUDA FP8 addition - __fp8_e5m2 result = __fp8_e5m2(__hadd(__half(a), __half(b))); - return UseClip ? clip(result) : result; -#else - // Fallback for non-gfx942 HIP platforms - __fp8_e5m2 result = __fp8_e5m2(float(a) + float(b)); - return UseClip ? 
clip(result) : result; -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized addition for 2 elements (CUDA only) -template -__forceinline__ __device__ __fp8x2_e5m2 add_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - // CUDA: Convert to half2, add using optimized __hadd2, convert back - __fp8x2_e5m2 result = __fp8x2_e5m2(__hadd2(__half2(a), __half2(b))); - return result; -} - -// FP8 E5M2 vectorized addition for 4 elements (CUDA only - via 2x __fp8x2_e5m2) -template -__forceinline__ __device__ __fp8x4_e5m2 add_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using add_elements for 2 elements - __fp8x2_e5m2* a_pair = reinterpret_cast<__fp8x2_e5m2*>(&a); - __fp8x2_e5m2* b_pair = reinterpret_cast<__fp8x2_e5m2*>(&b); - - __fp8x2_e5m2 result[2]; - result[0] = add_elements(a_pair[0], b_pair[0]); - result[1] = add_elements(a_pair[1], b_pair[1]); - - return *reinterpret_cast<__fp8x4_e5m2*>(result); -} -#endif // !defined(__HIP_PLATFORM_AMD__) - -// FP8 E4M3 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e4m3 min_elements(__fp8_e4m3 a, __fp8_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e4m3(fminf(float(a), float(b))); -#else - return __fp8_e4m3(__hmin(__half(a), __half(b))); -#endif -} - -// FP8 E4M3 vectorized min for 2 elements -__forceinline__ __device__ __fp8x2_e4m3 min_elements(__fp8x2_e4m3 a, __fp8x2_e4m3 b) { -#if defined(__HIP_PLATFORM_AMD__) - // HIP implementation: use union and process element-wise - union { - __fp8_e4m3 fp8[2]; - __fp8x2_e4m3 fp8x2; - } ua, ub, result; - ua.fp8x2 = a; - ub.fp8x2 = b; - result.fp8[0] = min_elements(ua.fp8[0], ub.fp8[0]); - result.fp8[1] = min_elements(ua.fp8[1], ub.fp8[1]); - return result.fp8x2; -#else - return __fp8x2_e4m3(__hmin2(__half2(a), __half2(b))); -#endif -} - -// FP8 E4M3 vectorized min for 4 elements -__forceinline__ __device__ __fp8x4_e4m3 min_elements(__fp8x4_e4m3 a, __fp8x4_e4m3 b) { - // Process as two __fp8x2_e4m3 using min_elements for 2 elements - union { - __fp8x4_e4m3 vec4; - __fp8x2_e4m3 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} - -// FP8 E5M2 min operation (single element) -template <> -__forceinline__ __device__ __fp8_e5m2 min_elements(__fp8_e5m2 a, __fp8_e5m2 b) { -#if defined(__HIP_PLATFORM_AMD__) - return __fp8_e5m2(fminf(float(a), float(b))); -#else - return __fp8_e5m2(__hmin(__half(a), __half(b))); -#endif -} - -#if !defined(__HIP_PLATFORM_AMD__) -// FP8 E5M2 vectorized min for 2 elements (CUDA only) -__forceinline__ __device__ __fp8x2_e5m2 min_elements(__fp8x2_e5m2 a, __fp8x2_e5m2 b) { - return __fp8x2_e5m2(__hmin2(__half2(a), __half2(b))); -} - -// FP8 E5M2 vectorized min for 4 elements (CUDA only) -__forceinline__ __device__ __fp8x4_e5m2 min_elements(__fp8x4_e5m2 a, __fp8x4_e5m2 b) { - // Process as two __fp8x2_e5m2 using min_elements for 2 elements - union { - __fp8x4_e5m2 vec4; - __fp8x2_e5m2 vec2[2]; - } ua, ub, uresult; - ua.vec4 = a; - ub.vec4 = b; - - uresult.vec2[0] = min_elements(ua.vec2[0], ub.vec2[0]); - uresult.vec2[1] = min_elements(ua.vec2[1], ub.vec2[1]); - - return uresult.vec4; -} -#endif // !defined(__HIP_PLATFORM_AMD__) -#endif // __FP8_TYPES_EXIST__ - -template -__forceinline__ __device__ T cal_elements(T a, T b) { - if constexpr (OpType == SUM) { - return add_elements(a, b); - } else if constexpr (OpType == MIN) { - return min_elements(a, b); - } - // 
Should never reach here - return a; -} - -template -__forceinline__ __device__ int4 cal_vectors_helper(int4 a, int4 b) { - int4 ret; - ret.w = bit_cast(cal_elements(bit_cast(a.w), bit_cast(b.w))); - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - ret.z = bit_cast(cal_elements(bit_cast(a.z), bit_cast(b.z))); - return ret; -} - -template -__forceinline__ __device__ uint2 cal_vectors_helper(uint2 a, uint2 b) { - uint2 ret; - ret.x = bit_cast(cal_elements(bit_cast(a.x), bit_cast(b.x))); - ret.y = bit_cast(cal_elements(bit_cast(a.y), bit_cast(b.y))); - return ret; -} - -template -__forceinline__ __device__ int cal_vectors_helper(int a, int b) { - return bit_cast(cal_elements(bit_cast(a), bit_cast(b))); -} - -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) -// Helper function to perform FP8 vector addition - dispatches based on scalar type -// Uses AMD builtins from hip/amd_detail/amd_hip_fp8.h: -// - __builtin_amdgcn_cvt_pk_f32_fp8/bf8: Convert 2 FP8 values to 2 floats -// - __builtin_amdgcn_cvt_pk_fp8/bf8_f32: Convert 2 floats to 2 FP8 values -// The 'word' parameter (false/true) selects low/high 16-bit word from uint32_t -template -__forceinline__ __device__ int add_fp8x4_hip(int a, int b) { - uint32_t a32 = static_cast(a); - uint32_t b32 = static_cast(b); - - float2 v_low, v_high; - uint32_t ival = 0; - - if constexpr (std::is_same_v) { - // E4M3 using fp8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_fp8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_fp8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_fp8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_fp8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } else { // __fp8_e5m2 - // E5M2 using bf8 conversion - process low word (false) and high word (true) - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_low) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, false)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, false))); - uint16_t result_low = __builtin_amdgcn_cvt_pk_bf8_f32(v_low.x, v_low.y, ival, false); - - asm volatile("v_pk_add_f32 %0, %1, %2" - : "=v"(v_high) - : "v"(__builtin_amdgcn_cvt_pk_f32_bf8(a32, true)), "v"(__builtin_amdgcn_cvt_pk_f32_bf8(b32, true))); - uint16_t result_high = __builtin_amdgcn_cvt_pk_bf8_f32(v_high.x, v_high.y, ival, false); - - uint32_t result = (static_cast(result_high) << 16) | result_low; - return static_cast(result); - } -} -#endif - -template -__forceinline__ __device__ DataType cal_vectors(DataType a, DataType b) { -#if defined(__HIP_PLATFORM_AMD__) && defined(__FP8_TYPES_EXIST__) && defined(__gfx942__) - // For FP8 types on HIP gfx942, use specialized helper that dispatches based on scalar type - if constexpr (std::is_same_v || std::is_same_v) { - if constexpr (OpType == SUM) { - if constexpr (std::is_same_v || std::is_same_v) { - // Handle int/uint32_t (4 FP8 elements) - return add_fp8x4_hip(a, b); - } else if constexpr (std::is_same_v) { - // Handle int4 (16 FP8 elements) - process as 4 ints - int4 ret; - ret.w = add_fp8x4_hip(a.w, b.w); - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = 
add_fp8x4_hip(a.y, b.y); - ret.z = add_fp8x4_hip(a.z, b.z); - return ret; - } else if constexpr (std::is_same_v) { - // Handle uint2 (8 FP8 elements) - process as 2 ints - uint2 ret; - ret.x = add_fp8x4_hip(a.x, b.x); - ret.y = add_fp8x4_hip(a.y, b.y); - return ret; - } - } - } -#endif - - // Define the vectorized computation type based on the element type - using CompType = typename std::conditional_t< - std::is_same_v, __half2, - std::conditional_t, __bfloat162, -#if defined(__FP8_TYPES_EXIST__) - std::conditional_t, __fp8x4_e4m3, - std::conditional_t, __fp8x4_e5m2, -#endif - T -#if defined(__FP8_TYPES_EXIST__) - >>>>; -#else - >>; -#endif - return cal_vectors_helper(a, b); -} - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 template MSCCLPP_DEVICE_INLINE constexpr std::size_t calcVectorSize() { @@ -472,7 +42,12 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src // nvls can only handle 4 bytes alignment MSCCLPP_ASSERT_DEVICE(size % 4 == 0, "size must be 4 bytes aligned"); constexpr size_t nElem = calcVectorSize(); - using vectorType = mscclpp::VectorType; + // For integer types, use 1-element vectors since multimem doesn't support vectorized integer operations + constexpr size_t vecSize = (std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v) + ? 1 + : nElem; + using vectorType = mscclpp::VectorType; const size_t nVec = size / sizeof(vectorType); const size_t srcOffset4 = srcOffset / sizeof(vectorType); const size_t dstOffset4 = dstOffset / sizeof(vectorType); From 620378b4fb3c9180dc4259d918b1b769d04d6d73 Mon Sep 17 00:00:00 2001 From: Qinghua Zhou Date: Fri, 6 Feb 2026 01:25:12 +0800 Subject: [PATCH 05/52] Fix cpplint error in main branch (#740) Fix the legacy cpplint error in main branch. --------- Co-authored-by: Qinghua Zhou Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Binyang Li --- .../01-basic-concepts/gpu_ping_pong.cu | 14 +++++----- .../02-bootstrap/gpu_ping_pong_mp.cu | 12 ++++---- .../03-memory-channel/bidir_memory_channel.cu | 28 +++++++++---------- .../04-port-channel/bidir_port_channel.cu | 14 +++++----- include/mscclpp/assert_device.hpp | 8 +++--- python/csrc/error_py.cpp | 8 +++--- python/csrc/npkit_py.cpp | 2 +- python/csrc/numa_py.cpp | 2 +- src/core/context.cc | 10 +++---- src/core/include/context.hpp | 6 ++-- src/core/include/gpu_ipc_mem.hpp | 10 +++---- src/core/include/ibverbs_wrapper.hpp | 22 +++++++-------- src/ext/nccl/audit-shim/audit_nccl.cc | 8 +++--- test/unit/local_channel_tests.cu | 6 ++-- 14 files changed, 75 insertions(+), 75 deletions(-) diff --git a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu index 0e2ab5ad..f3c69b72 100644 --- a/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu +++ b/examples/tutorials/01-basic-concepts/gpu_ping_pong.cu @@ -9,7 +9,7 @@ #include template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -23,7 +23,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -34,7 +34,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -88,7 +88,7 @@ int main() { mscclpp::Semaphore sema0(/*localSemaphoreStub*/ semaStub0, /*remoteSemaphoreStub*/ semaStub1); mscclpp::BaseMemoryChannel memChan0(sema0); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle0 = memChan0.deviceHandle(); - void *devHandle0; + void* devHandle0; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle0, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle0, &memChanHandle0, sizeof(memChanHandle0), cudaMemcpyHostToDevice)); @@ -98,14 +98,14 @@ int main() { mscclpp::Semaphore sema1(/*localSemaphoreStub*/ semaStub1, /*remoteSemaphoreStub*/ semaStub0); mscclpp::BaseMemoryChannel memChan1(sema1); mscclpp::BaseMemoryChannelDeviceHandle memChanHandle1 = memChan1.deviceHandle(); - void *devHandle1; + void* devHandle1; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle1, sizeof(mscclpp::BaseMemoryChannelDeviceHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle1, &memChanHandle1, sizeof(memChanHandle1), cudaMemcpyHostToDevice)); log("GPU 0: Launching gpuKernel0 ..."); MSCCLPP_CUDATHROW(cudaSetDevice(0)); - gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle0), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle0), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); log("GPU 1: Launching gpuKernel1 ..."); @@ -115,7 +115,7 @@ int main() { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle1), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle1), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu index 05eb1b25..0526407e 100644 --- a/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu +++ b/examples/tutorials/02-bootstrap/gpu_ping_pong_mp.cu @@ -14,7 +14,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -50,7 +50,7 @@ __device__ void spin_cycles(unsigned long long cycles) { } } -__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedWait(); @@ -61,7 +61,7 @@ __global__ void gpuKernel0(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, in } } -__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle *devHandle, int iter) { +__global__ void gpuKernel1(mscclpp::BaseMemoryChannelDeviceHandle* devHandle, int iter) { if (threadIdx.x + blockIdx.x * blockDim.x == 0) { for (int i = 0; i < iter; ++i) { devHandle->relaxedSignal(); @@ -115,14 +115,14 @@ void worker(int gpuId) { mscclpp::BaseMemoryChannel memChan(sema); auto memChanHandle = memChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); log("GPU ", gpuId, ": Launching a GPU kernel ..."); if (gpuId == 0) { - gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle), iter); + gpuKernel0<<<1, 1>>>(reinterpret_cast(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaDeviceSynchronize()); } else { @@ -130,7 +130,7 @@ void worker(int gpuId) { MSCCLPP_CUDATHROW(cudaEventCreate(&start)); MSCCLPP_CUDATHROW(cudaEventCreate(&end)); MSCCLPP_CUDATHROW(cudaEventRecord(start)); - gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle), iter); + gpuKernel1<<<1, 1>>>(reinterpret_cast(devHandle), iter); MSCCLPP_CUDATHROW(cudaGetLastError()); MSCCLPP_CUDATHROW(cudaEventRecord(end)); MSCCLPP_CUDATHROW(cudaEventSynchronize(end)); diff --git a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu index cfbf12d7..a1be59f2 100644 --- a/examples/tutorials/03-memory-channel/bidir_memory_channel.cu +++ b/examples/tutorials/03-memory-channel/bidir_memory_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... 
<< args); ss << std::endl; @@ -47,7 +47,7 @@ int wait_process(int pid) { __device__ mscclpp::DeviceSyncer devSyncer; -__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -65,7 +65,7 @@ __global__ void bidirPutKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si } } -__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->relaxedSignal(); @@ -79,7 +79,7 @@ __global__ void bidirGetKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, si devHandle->get(srcOffset, dstOffset, copyBytes, /*threadId*/ tid, /*numThreads*/ blockDim.x * gridDim.x); } -__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHandle, size_t copyBytes, int myRank, +__global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle* devHandle, size_t copyBytes, int myRank, uint32_t flag) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { @@ -95,7 +95,7 @@ __global__ void bidirPutPacketKernel(mscclpp::MemoryChannelDeviceHandle *devHand devHandle->unpackPackets(pktBufOffset, dstOffset, copyBytes, tid, blockDim.x * gridDim.x, flag); } -void worker(int myRank, int gpuId, const std::string &ipPort) { +void worker(int myRank, int gpuId, const std::string& ipPort) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); const int remoteRank = myRank == 0 ? 
1 : 0; const int nRanks = 2; @@ -132,8 +132,8 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { auto memChanHandle = memChan.deviceHandle(); auto memPktChanHandle = memPktChan.deviceHandle(); - void *devHandle; - void *devPktHandle; + void* devHandle; + void* devPktHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(memChanHandle))); MSCCLPP_CUDATHROW(cudaMalloc(&devPktHandle, sizeof(memPktChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &memChanHandle, sizeof(memChanHandle), cudaMemcpyHostToDevice)); @@ -145,18 +145,18 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { std::function kernels[3]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), - copyBytes, myRank); + bidirPutKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + myRank); }; kernels[1] = [&](size_t copyBytes) { - bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), - copyBytes, myRank); + bidirGetKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + myRank); }; kernels[2] = [&](size_t copyBytes) { static uint32_t flag = 1; - bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devPktHandle), + bidirPutPacketKernel<<<32, 1024, 0, stream>>>(reinterpret_cast(devPktHandle), copyBytes, myRank, flag++); }; @@ -215,7 +215,7 @@ void worker(int myRank, int gpuId, const std::string &ipPort) { bootstrap->barrier(); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (argc == 1) { int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); }); int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); }); @@ -241,7 +241,7 @@ int main(int argc, char **argv) { try { rank = std::stoi(argv[2]); gpuId = std::stoi(argv[3]); - } catch (const std::exception &) { + } catch (const std::exception&) { log("Error: rank and gpu_id must be valid integers."); return -1; } diff --git a/examples/tutorials/04-port-channel/bidir_port_channel.cu b/examples/tutorials/04-port-channel/bidir_port_channel.cu index 46064581..9e6d61dd 100644 --- a/examples/tutorials/04-port-channel/bidir_port_channel.cu +++ b/examples/tutorials/04-port-channel/bidir_port_channel.cu @@ -16,7 +16,7 @@ #define PORT_NUMBER "50505" template -void log(Args &&...args) { +void log(Args&&... args) { std::stringstream ss; (ss << ... << args); ss << std::endl; @@ -45,7 +45,7 @@ int wait_process(int pid) { return -1; } -__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size_t copyBytes, int myRank) { +__global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle* devHandle, size_t copyBytes, int myRank) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid == 0) { devHandle->signal(); @@ -58,7 +58,7 @@ __global__ void bidirPutKernel(mscclpp::PortChannelDeviceHandle *devHandle, size } } -void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport transport) { +void worker(int rank, int gpuId, const std::string& ipPort, mscclpp::Transport transport) { MSCCLPP_CUDATHROW(cudaSetDevice(gpuId)); const int myRank = rank; const int remoteRank = myRank == 0 ? 
1 : 0; @@ -90,7 +90,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t auto portChanHandle = portChan.deviceHandle(); - void *devHandle; + void* devHandle; MSCCLPP_CUDATHROW(cudaMalloc(&devHandle, sizeof(portChanHandle))); MSCCLPP_CUDATHROW(cudaMemcpy(devHandle, &portChanHandle, sizeof(portChanHandle), cudaMemcpyHostToDevice)); @@ -100,7 +100,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t std::function kernels[1]; kernels[0] = [&](size_t copyBytes) { - bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, + bidirPutKernel<<<1, 1, 0, stream>>>(reinterpret_cast(devHandle), copyBytes, myRank); }; @@ -166,7 +166,7 @@ void worker(int rank, int gpuId, const std::string &ipPort, mscclpp::Transport t bootstrap->barrier(); } -mscclpp::Transport parseTransport(const std::string &transportStr) { +mscclpp::Transport parseTransport(const std::string& transportStr) { if (transportStr == "CudaIpc") return mscclpp::Transport::CudaIpc; if (transportStr == "IB0") return mscclpp::Transport::IB0; if (transportStr == "IB1") return mscclpp::Transport::IB1; @@ -180,7 +180,7 @@ mscclpp::Transport parseTransport(const std::string &transportStr) { throw std::runtime_error("Unknown transport: " + transportStr); } -int main(int argc, char **argv) { +int main(int argc, char** argv) { if (argc == 1) { int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); }); int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER, mscclpp::Transport::CudaIpc); }); diff --git a/include/mscclpp/assert_device.hpp b/include/mscclpp/assert_device.hpp index bf982ba6..1b9cb611 100644 --- a/include/mscclpp/assert_device.hpp +++ b/include/mscclpp/assert_device.hpp @@ -19,11 +19,11 @@ #else // defined(DEBUG_BUILD) #if defined(MSCCLPP_DEVICE_HIP) -extern "C" __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function); +extern "C" __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function); #else // !defined(MSCCLPP_DEVICE_HIP) -extern "C" __host__ __device__ void __assert_fail(const char *__assertion, const char *__file, unsigned int __line, - const char *__function) __THROW; +extern "C" __host__ __device__ void __assert_fail(const char* __assertion, const char* __file, unsigned int __line, + const char* __function) __THROW; #endif // !defined(MSCCLPP_DEVICE_HIP) /// Assert a condition on the device and print a message if the condition is false. 
diff --git a/python/csrc/error_py.cpp b/python/csrc/error_py.cpp index 208f4e84..c19a3b15 100644 --- a/python/csrc/error_py.cpp +++ b/python/csrc/error_py.cpp @@ -11,17 +11,17 @@ using namespace mscclpp; #define REGISTER_EXCEPTION_TRANSLATOR(name_) \ nb::register_exception_translator( \ - [](const std::exception_ptr &p, void *payload) { \ + [](const std::exception_ptr& p, void* payload) { \ try { \ std::rethrow_exception(p); \ - } catch (const name_ &e) { \ - PyErr_SetObject(reinterpret_cast(payload), \ + } catch (const name_& e) { \ + PyErr_SetObject(reinterpret_cast(payload), \ PyTuple_Pack(2, PyLong_FromLong(long(e.getErrorCode())), PyUnicode_FromString(e.what()))); \ } \ }, \ m.attr(#name_).ptr()); -void register_error(nb::module_ &m) { +void register_error(nb::module_& m) { nb::enum_(m, "CppErrorCode") .value("SystemError", ErrorCode::SystemError) .value("InternalError", ErrorCode::InternalError) diff --git a/python/csrc/npkit_py.cpp b/python/csrc/npkit_py.cpp index 8aaa8011..8c158354 100644 --- a/python/csrc/npkit_py.cpp +++ b/python/csrc/npkit_py.cpp @@ -8,7 +8,7 @@ namespace nb = nanobind; -void register_npkit(nb::module_ &m) { +void register_npkit(nb::module_& m) { nb::module_ sub_m = m.def_submodule("cpp_npkit", "NPKit functions"); sub_m.def("init", &NpKit::Init); sub_m.def("dump", &NpKit::Dump); diff --git a/python/csrc/numa_py.cpp b/python/csrc/numa_py.cpp index 4433ecc8..fadc0f69 100644 --- a/python/csrc/numa_py.cpp +++ b/python/csrc/numa_py.cpp @@ -6,7 +6,7 @@ int getDeviceNumaNode(int cudaDev); void numaBind(int node); }; // namespace mscclpp -void register_numa(nb::module_ &m) { +void register_numa(nb::module_& m) { nb::module_ sub_m = m.def_submodule("cpp_numa", "numa functions"); sub_m.def("get_device_numa_node", &mscclpp::getDeviceNumaNode); sub_m.def("numa_bind", &mscclpp::numaBind); diff --git a/src/core/context.cc b/src/core/context.cc index 9bf299d3..a5cdffb2 100644 --- a/src/core/context.cc +++ b/src/core/context.cc @@ -23,14 +23,14 @@ void CudaIpcStream::setStreamIfNeeded() { } } -void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyD2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_)); dirty_ = true; } -void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) { +void CudaIpcStream::memcpyH2D(void* dst, const void* src, size_t nbytes) { CudaDeviceGuard deviceGuard(deviceId_); setStreamIfNeeded(); MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_)); @@ -48,7 +48,7 @@ void CudaIpcStream::sync() { Context::Impl::Impl() {} -IbCtx *Context::Impl::getIbContext(Transport ibTransport) { +IbCtx* Context::Impl::getIbContext(Transport ibTransport) { // Find IB context or create it auto it = ibContexts_.find(ibTransport); if (it == ibContexts_.end()) { @@ -70,7 +70,7 @@ MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique()) {} MSCCLPP_API_CPP Context::~Context() = default; -MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void *ptr, size_t size, TransportFlags transports) { +MSCCLPP_API_CPP RegisteredMemory Context::registerMemory(void* ptr, size_t size, TransportFlags transports) { return RegisteredMemory(std::make_shared(ptr, size, transports, *pimpl_)); } @@ -78,7 +78,7 @@ MSCCLPP_API_CPP Endpoint Context::createEndpoint(EndpointConfig config) { return Endpoint(std::make_shared(config, *pimpl_)); } -MSCCLPP_API_CPP 
Connection Context::connect(const Endpoint &localEndpoint, const Endpoint &remoteEndpoint) { +MSCCLPP_API_CPP Connection Context::connect(const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) { if (localEndpoint.device().type == DeviceType::GPU && localEndpoint.device().id < 0) { throw Error("No GPU device ID provided for local endpoint", ErrorCode::InvalidUsage); } diff --git a/src/core/include/context.hpp b/src/core/include/context.hpp index b53a2662..ee84d0f7 100644 --- a/src/core/include/context.hpp +++ b/src/core/include/context.hpp @@ -24,9 +24,9 @@ class CudaIpcStream { public: CudaIpcStream(int deviceId); - void memcpyD2D(void *dst, const void *src, size_t nbytes); + void memcpyD2D(void* dst, const void* src, size_t nbytes); - void memcpyH2D(void *dst, const void *src, size_t nbytes); + void memcpyH2D(void* dst, const void* src, size_t nbytes); void sync(); @@ -44,7 +44,7 @@ struct Context::Impl { Impl(); - IbCtx *getIbContext(Transport ibTransport); + IbCtx* getIbContext(Transport ibTransport); std::shared_ptr getToken(); }; diff --git a/src/core/include/gpu_ipc_mem.hpp b/src/core/include/gpu_ipc_mem.hpp index 98fa47f2..923e807d 100644 --- a/src/core/include/gpu_ipc_mem.hpp +++ b/src/core/include/gpu_ipc_mem.hpp @@ -46,7 +46,7 @@ struct GpuIpcMemHandle { char handle[64]; } fabric; - static void deleter(GpuIpcMemHandle *handle); + static void deleter(GpuIpcMemHandle* handle); // We make GpuIpcMemHandle trivially copyable for easy serialization, // and thus it cannot have explicit destructors. @@ -61,7 +61,7 @@ struct GpuIpcMemHandle { using Base::Base; // Allow implicit conversion from Base - UniquePtr(Base &&other) : Base(std::move(other)) {} + UniquePtr(Base&& other) : Base(std::move(other)) {} }; static UniquePtr create(const CUdeviceptr ptr); @@ -70,7 +70,7 @@ struct GpuIpcMemHandle { using UniqueGpuIpcMemHandle = GpuIpcMemHandle::UniquePtr; -std::ostream &operator<<(std::ostream &os, const GpuIpcMemHandle::TypeFlags &typeFlags); +std::ostream& operator<<(std::ostream& os, const GpuIpcMemHandle::TypeFlags& typeFlags); static_assert(std::is_trivially_copyable_v); @@ -82,7 +82,7 @@ class GpuIpcMem : public std::enable_shared_from_this { /// Create a GpuIpcMem instance from a GpuIpcMemHandle. /// @param handle The handle to import. /// @return A shared_ptr to the created GpuIpcMem instance. - static std::shared_ptr create(const GpuIpcMemHandle &handle); + static std::shared_ptr create(const GpuIpcMemHandle& handle); ~GpuIpcMem(); @@ -102,7 +102,7 @@ class GpuIpcMem : public std::enable_shared_from_this { std::shared_ptr mapMulticast(int numDevices, size_t mcOffset, CUdeviceptr bufferAddr, size_t bufferSize); private: - GpuIpcMem(const GpuIpcMemHandle &handle); + GpuIpcMem(const GpuIpcMemHandle& handle); GpuIpcMemHandle handle_; CUmemGenericAllocationHandle allocHandle_; diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index 45054ff3..b5ab2eff 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -12,12 +12,12 @@ namespace mscclpp { struct IBVerbs { private: - static void *dlsym(const std::string &symbol, bool allowReturnNull = false); + static void* dlsym(const std::string& symbol, bool allowReturnNull = false); public: #define REGISTER_IBV_FUNC_WITH_NAME(name__, func__) \ template \ - static inline auto(name__)(Args && ...args) { \ + static inline auto(name__)(Args && ... 
args) { \ static_assert(sizeof(&::func__) > 0, #func__ " is expected be a function, not a macro"); \ static decltype(&::func__) impl = nullptr; \ if (!impl) impl = reinterpret_cast(IBVerbs::dlsym(#func__)); \ @@ -46,7 +46,7 @@ struct IBVerbs { REGISTER_IBV_FUNC(ibv_wc_status_str) static bool isDmabufSupported(); - static struct ibv_mr *ibv_reg_dmabuf_mr(struct ibv_pd *, uint64_t, size_t, uint64_t, int, int); + static struct ibv_mr* ibv_reg_dmabuf_mr(struct ibv_pd*, uint64_t, size_t, uint64_t, int, int); /// /// Below is for cases where the API (may be / is) a macro. Refer to `infiniband/verbs.h`. @@ -57,8 +57,8 @@ struct IBVerbs { #else // defined(ibv_get_device_list) #undef ibv_get_device_list REGISTER_IBV_FUNC(ibv_static_providers) - static inline struct ibv_device **ibv_get_device_list(int *num_devices) { - using FuncType = struct ibv_device **(*)(int *); + static inline struct ibv_device** ibv_get_device_list(int* num_devices) { + using FuncType = struct ibv_device** (*)(int*); static FuncType impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_get_device_list")); IBVerbs::ibv_static_providers(NULL, _RDMA_STATIC_PREFIX(RDMA_STATIC_PROVIDERS), NULL); @@ -67,21 +67,21 @@ struct IBVerbs { #endif // defined(ibv_get_device_list) #undef ibv_query_port - static inline int ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + static inline int ibv_query_port(struct ibv_context* context, uint8_t port_num, struct ibv_port_attr* port_attr) { static decltype(&::ibv_query_port) impl = nullptr; if (!impl) impl = reinterpret_cast(IBVerbs::dlsym("ibv_query_port")); - struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + struct verbs_context* vctx = verbs_get_ctx_op(context, query_port); if (!vctx) { int rc; ::memset(port_attr, 0, sizeof(*port_attr)); - rc = impl(context, port_num, (struct _compat_ibv_port_attr *)port_attr); + rc = impl(context, port_num, (struct _compat_ibv_port_attr*)port_attr); return rc; } return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); } #undef ibv_reg_mr - static inline struct ibv_mr *ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access) { + static inline struct ibv_mr* ibv_reg_mr(struct ibv_pd* pd, void* addr, size_t length, int access) { static decltype(&::ibv_reg_mr) impl = nullptr; static decltype(&::ibv_reg_mr_iova2) impl_iova2 = nullptr; int is_access_const = __builtin_constant_p(((int)(access)&IBV_ACCESS_OPTIONAL_RANGE) == 0); @@ -98,11 +98,11 @@ struct IBVerbs { /// Below is for cases where the API (may be / is) a static function. Refer to `infiniband/verbs.h`. 
/// - static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + static inline int ibv_post_send(struct ibv_qp* qp, struct ibv_send_wr* wr, struct ibv_send_wr** bad_wr) { return qp->context->ops.post_send(qp, wr, bad_wr); } - static inline int ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc) { + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } }; diff --git a/src/ext/nccl/audit-shim/audit_nccl.cc b/src/ext/nccl/audit-shim/audit_nccl.cc index 5e3ab6f2..7fdeb67b 100644 --- a/src/ext/nccl/audit-shim/audit_nccl.cc +++ b/src/ext/nccl/audit-shim/audit_nccl.cc @@ -8,11 +8,11 @@ extern "C" __attribute__((visibility("default"))) unsigned int la_version(unsigned int) { return LAV_CURRENT; } -extern "C" __attribute__((visibility("default"))) char *la_objsearch(const char *name, uintptr_t *, unsigned int) { - const char *library = "libmscclpp_nccl.so"; +extern "C" __attribute__((visibility("default"))) char* la_objsearch(const char* name, uintptr_t*, unsigned int) { + const char* library = "libmscclpp_nccl.so"; if (strcmp(name, "libnccl.so.2") && strcmp(name, "libnccl.so") && strcmp(name, "librccl.so") && strcmp(name, "librccl.so.1")) { - return (char *)name; + return (char*)name; } - return (char *)library; + return (char*)library; } \ No newline at end of file diff --git a/test/unit/local_channel_tests.cu b/test/unit/local_channel_tests.cu index 7414f6bb..50ffc9ea 100644 --- a/test/unit/local_channel_tests.cu +++ b/test/unit/local_channel_tests.cu @@ -12,10 +12,10 @@ __constant__ mscclpp::PortChannelDeviceHandle gPortChannel; -__global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, int *ret) { +__global__ void kernelLocalPortChannelTest(void* dst, void* src, size_t bytes, int* ret) { if (blockIdx.x == 0) { // sender - int *ptr = reinterpret_cast(src); + int* ptr = reinterpret_cast(src); for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) { ptr[idx] = MAGIC_CONST; } @@ -29,7 +29,7 @@ __global__ void kernelLocalPortChannelTest(void *dst, void *src, size_t bytes, i gPortChannel.wait(); } __syncthreads(); - int *ptr = reinterpret_cast(dst); + int* ptr = reinterpret_cast(dst); for (size_t idx = threadIdx.x; idx < bytes / sizeof(int); idx += blockDim.x) { if (ptr[idx] != MAGIC_CONST) { *ret = 1; // Error: value mismatch From d7925448f38e5e9236ca571b524f4bb01df6f02f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 7 Feb 2026 04:27:01 +0900 Subject: [PATCH 06/52] Update `copilot-instructions.md` (#722) --- .github/copilot-instructions.md | 10 ++++++++-- .gitignore | 6 ++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 4cf9dbf8..4f13c557 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -25,7 +25,7 @@ For C/C++/CUDA source code: ``` ## Formatting -If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only staged files. +If you have modified any code in the project, run `./tools/lint.sh` to automatically format the entire source code before finishing iterations. Note that this script formats only files that are tracked by git, so if you have added new files, make sure to `git add` them first. 
## Building and Testing The following commands are commonly used for building and testing the project. See `docs/quickstart.md` for more detailed instructions. @@ -40,7 +40,7 @@ cd .. For testing after successful build: ```bash -# To run all tests +# To run tests with two GPUs - two is enough for most tests mpirun -np 2 ./build/bin/mp_unit_tests # To run tests excluding IB-related ones (when IB is not available) mpirun -np 2 ./build/bin/mp_unit_tests --gtest_filter=-*Ib* @@ -51,6 +51,12 @@ For building a Python package: python3 -m pip install -e . ``` +For Python tests after building the package: +```bash +# Run tests with 8 GPUs - adjust the number as needed +mpirun -np 8 python3 -m pytest ./python/test/test_mscclpp.py -vx +``` + For building documentation (see dependencies in `docs/requirements.txt`): ```bash cd docs diff --git a/.gitignore b/.gitignore index 9c4da143..ed3b94c4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,8 @@ .vscode/ -.hypothesis/ build/ -dist/ __pycache__ .*.swp -.idea/ *.so +.pytest_cache/ +_codeql_detected_source_root docs/_static/versions.js -_codeql_detected_source_root \ No newline at end of file From c12822a7af908c6aec7c07385d639f04dd329e35 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 9 Feb 2026 16:55:16 -0800 Subject: [PATCH 07/52] create CI pipeline for rocm (#718) Create CI pipeline for AMD GPU. --- .azure-pipelines/integration-test-rocm.yml | 114 --------------------- .azure-pipelines/templates/ut.yaml | 13 ++- .azure-pipelines/ut-rocm.yml | 50 +++++++++ README.md | 2 +- docker/base-dev-x.dockerfile | 14 ++- docker/base-x-rocm.dockerfile | 19 ---- docker/build.sh | 14 +-- python/requirements_cuda13.txt | 3 +- python/requirements_rocm6.txt | 10 ++ test/CMakeLists.txt | 3 + test/deploy/deploy.sh | 17 ++- test/deploy/setup.sh | 17 ++- 12 files changed, 118 insertions(+), 158 deletions(-) delete mode 100644 .azure-pipelines/integration-test-rocm.yml create mode 100644 .azure-pipelines/ut-rocm.yml delete mode 100644 docker/base-x-rocm.dockerfile diff --git a/.azure-pipelines/integration-test-rocm.yml b/.azure-pipelines/integration-test-rocm.yml deleted file mode 100644 index a4ffcfc3..00000000 --- a/.azure-pipelines/integration-test-rocm.yml +++ /dev/null @@ -1,114 +0,0 @@ -trigger: - branches: - include: - - main - - release/* - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -pr: - branches: - include: - - main - - release/* - drafts: false - paths: - exclude: - - .devcontainer/** - - .github/** - - docker/** - - docs/** - - '**/*.md' - -jobs: -- job: IntegrationTestRocm - displayName: Integration test ROCm - strategy: - matrix: - rocm6.2: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 - - pool: - name: mscclpp-rocm - container: - image: $[ variables['containerImage'] ] - options: --privileged --ipc=host --security-opt seccomp=unconfined --group-add video --ulimit memlock=-1:-1 - - steps: - - task: Bash@3 - name: Build - displayName: Build - inputs: - targetType: 'inline' - script: | - mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON .. 
- make -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallRcclTest - displayName: Install rccl-test - inputs: - targetType: 'inline' - script: | - git clone https://github.com/ROCm/rccl-tests.git - cd rccl-tests - make MPI=1 MPI_HOME=/usr/local/mpi HIP_HOME=/opt/rocm -j - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: InstallDep - displayName: Install dependencies - inputs: - targetType: 'inline' - script: | - set -e - git clone https://github.com/Azure/msccl-tools.git - cd msccl-tools - pip3 install . - - - task: Bash@3 - name: GenerateExectionFiles - displayName: Generate execution files - inputs: - targetType: 'inline' - script: | - set -e - git clone https://$(GIT_USER):$(GIT_PAT)@msazure.visualstudio.com/DefaultCollection/One/_git/msccl-users - cd msccl-users - mkdir execution-files - python3 algos/allreduce_mi300_packet.py 8 8 > execution-files/allreduce_mi300_packet.json - python3 algos/allreduce_mi300_sm_mscclpp.py 8 8 > execution-files/allreduce_mi300_sm_mscclpp.json - - - task: Bash@3 - name: AllReduceTest - displayName: Run mscclpp allReduce test - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN -x LD_PRELOAD="$(pwd)/build/lib/libmscclpp_nccl.so" \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' - - - task: Bash@3 - name: AllReduceWithExecutionFileTest - displayName: Run mscclpp allReduce with execution file - inputs: - targetType: 'inline' - script: | - set -e - export PATH=/usr/local/mpi/bin:$PATH - sudo mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=$(pwd)/build/lib/libmscclpp_nccl.so -x NCCL_DEBUG=WARN \ - -x ALLREDUCEPKT_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_packet.json \ - -x ALLREDUCE_IP_JSON_FILE=./msccl-users/execution-files/allreduce_mi300_sm_mscclpp.json \ - -x ALLREDUCE_SMALL_MSG_BOUNDARY=32K -x ALLREDUCE_LARGE_MSG_BOUNDARY=1M ./rccl-tests/build/all_reduce_perf \ - -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 100 - workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/templates/ut.yaml b/.azure-pipelines/templates/ut.yaml index 093a6094..82ff4aac 100644 --- a/.azure-pipelines/templates/ut.yaml +++ b/.azure-pipelines/templates/ut.yaml @@ -5,6 +5,9 @@ parameters: type: string - name: sshKeySecureFile type: string +- name: platform + type: string + default: 'cuda' - name: gpuArch type: string @@ -16,7 +19,11 @@ steps: targetType: 'inline' script: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + if [ "${{ parameters.platform }}" == "rocm" ]; then + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. + else + cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} .. 
+ fi make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -52,7 +59,7 @@ steps: inputs: targetType: filePath filePath: test/deploy/deploy.sh - arguments: "single-node-test" + arguments: "single-node-test true ${{ parameters.platform }}" workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -119,7 +126,7 @@ steps: export PATH=/usr/local/mpi/bin:\$PATH \ export LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ - mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=/root/mscclpp -x GPU_MAX_HW_QUEUES=8 -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' kill $CHILD_PID workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/ut-rocm.yml b/.azure-pipelines/ut-rocm.yml new file mode 100644 index 00000000..8b0aed1a --- /dev/null +++ b/.azure-pipelines/ut-rocm.yml @@ -0,0 +1,50 @@ +trigger: + branches: + include: + - main + - release/* + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +pr: + branches: + include: + - main + - release/* + drafts: false + paths: + exclude: + - .devcontainer/** + - .github/** + - apps/** + - docker/** + - docs/** + - '**/*.md' + +jobs: +- job: UnitTestMI300X + timeoutInMinutes: 40 + pool: + name: msccl-ci-mi300x + strategy: + matrix: + rocm6_2: + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-rocm6.2 + + container: + image: $(containerImage) + + steps: + - template: templates/ut.yaml + parameters: + subscription: mscclpp-ci-mi300x + vmssName: mscclpp-mi300x-ci + sshKeySecureFile: mscclpp.pem + platform: rocm + gpuArch: gfx942 diff --git a/README.md b/README.md index 69ae5add..8f300a2a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ |--------------------------|-------------------| | Unit Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398325&branchName=main) | | Integration Tests (CUDA) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=398479&branchName=main) | -| Integration Tests (ROCm) | [![Build Status](https://dev.azure.com/msazure/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-test-rocm?branchName=main)](https://dev.azure.com/msazure/One/_build/latest?definitionId=399295&branchName=main) | +| Unit Tests (ROCm) | [![Build Status](https://msazure.visualstudio.com/One/_apis/build/status%2FCustom%2FMSCCLPP%2Fmscclpp-ut-rocm?branchName=main)](https://msazure.visualstudio.com/One/_build/latest?definitionId=399295&branchName=main) | A GPU-driven communication stack for scalable AI applications. 
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 04ba1f03..3aa81422 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -24,6 +24,16 @@ RUN OS_ARCH=$(uname -m) && \ rm -rf ${CMAKE_HOME}.tar.gz && \ ln -s /usr/local/cmake-${CMAKE_VERSION}-linux-${OS_ARCH}/bin/* /usr/bin/ +# Install ROCm-specific packages if building for ROCm +ARG TARGET="cuda13.0" +RUN if echo "$TARGET" | grep -q "^rocm"; then \ + apt-get update -y && \ + apt-get install -y hipblas hipsparse rocsparse rocrand hiprand rocthrust rocsolver rocfft hipfft hipcub rocprim rccl roctracer-dev && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/*; \ + fi + # Create Python venv RUN python3 -m venv /root/venv && \ echo 'source /root/venv/bin/activate' >> /root/.bashrc @@ -32,8 +42,10 @@ ENV PATH="/root/venv/bin:${PATH}" # Install Python dependencies ADD . /tmp/mscclpp WORKDIR /tmp/mscclpp -ARG TARGET="cuda13.0" RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + if echo "$TARGET" | grep -q "^rocm"; then \ + export CUPY_INSTALL_USE_HIP=1 && export ROCM_HOME=/opt/rocm; \ + fi && \ pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r python/requirements_${target_type}.txt diff --git a/docker/base-x-rocm.dockerfile b/docker/base-x-rocm.dockerfile deleted file mode 100644 index 525ba1d4..00000000 --- a/docker/base-x-rocm.dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -ARG BASE_IMAGE -FROM ${BASE_IMAGE} - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source=https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -ENV RCCL_VERSION=rocm-6.2.0 -ARG GPU_ARCH=gfx942 -ENV ARCH_TARGET=${GPU_ARCH} -RUN cd /tmp && \ - git clone --branch ${RCCL_VERSION} --depth 1 https://github.com/ROCm/rccl.git && \ - cd rccl && \ - ./install.sh --prefix=/opt/rocm --amdgpu_targets ${ARCH_TARGET} && \ - cd .. && \ - rm -rf /tmp/rccl - -WORKDIR / diff --git a/docker/build.sh b/docker/build.sh index e9b10c3a..63552f74 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -12,7 +12,7 @@ baseImageTable=( ["cuda12.8"]="nvidia/cuda:12.8.1-devel-ubuntu22.04" ["cuda12.9"]="nvidia/cuda:12.9.1-devel-ubuntu22.04" ["cuda13.0"]="nvidia/cuda:13.0.2-devel-ubuntu24.04" - ["rocm6.2"]="rocm/rocm-terminal:6.2.1" + ["rocm6.2"]="rocm/dev-ubuntu-22.04:6.2.2" ) declare -A extraLdPathTable @@ -29,6 +29,7 @@ ofedVersionTable=( ["cuda12.8"]="24.10-1.1.4.0" ["cuda12.9"]="24.10-1.1.4.0" ["cuda13.0"]="24.10-3.2.5.0" + ["rocm6.2"]="24.10-1.1.4.0" ) TARGET=${1} @@ -68,18 +69,11 @@ docker build -t ${TAG_TMP} \ if [[ ${TARGET} == rocm* ]]; then echo "Building ROCm base image..." - docker build -t ${TAG_BASE} \ - -f docker/base-x-rocm.dockerfile \ - --build-arg BASE_IMAGE=${TAG_TMP} \ - --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ - --build-arg TARGET=${TARGET} \ - --build-arg GPU_ARCH="gfx942" . - docker rmi ${TAG_TMP} else echo "Building CUDA base image..." 
- docker tag ${TAG_TMP} ${TAG_BASE} - docker rmi --no-prune ${TAG_TMP} fi +docker tag ${TAG_TMP} ${TAG_BASE} +docker rmi --no-prune ${TAG_TMP} docker build -t ${TAG_BASE_DEV} \ -f docker/base-dev-x.dockerfile \ diff --git a/python/requirements_cuda13.txt b/python/requirements_cuda13.txt index b49a404c..49cf13bc 100644 --- a/python/requirements_cuda13.txt +++ b/python/requirements_cuda13.txt @@ -6,4 +6,5 @@ pytest numpy matplotlib sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed -blake3 \ No newline at end of file +blake3 +pybind11 \ No newline at end of file diff --git a/python/requirements_rocm6.txt b/python/requirements_rocm6.txt index e69de29b..d2a3389b 100644 --- a/python/requirements_rocm6.txt +++ b/python/requirements_rocm6.txt @@ -0,0 +1,10 @@ +mpi4py==4.1.1 +cupy==13.6.0 +prettytable +netifaces +pytest +numpy +matplotlib +sortedcontainers @ git+https://github.com/grantjenks/python-sortedcontainers.git@3ac358631f58c1347f1d6d2d92784117db0f38ed +blake3 +pybind11 \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8e41aac5..6452ebf8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -14,6 +14,9 @@ set(TEST_INC_INTERNAL PRIVATE ${PROJECT_SOURCE_DIR}/src/core/include) if(MSCCLPP_USE_ROCM) file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu) set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX) + foreach(arch ${MSCCLPP_GPU_ARCHS}) + add_compile_options(--offload-arch=${arch}) + endforeach() endif() function(add_test_executable name sources) diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index ccf85abd..b26ff1a8 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -1,8 +1,8 @@ set -e -# get parameter from $1 and $2 TEST_NAME=$1 IB_ENVIRONMENT="${2:-true}" +PLATFORM="${3:-cuda}" KeyFilePath=${SSHKEYFILE_SECUREFILEPATH} ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/" @@ -35,20 +35,29 @@ set -e parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}" parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR} +if [ "${PLATFORM}" == "rocm" ]; then + parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu" +fi + # force to pull the latest image parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" + +LAUNCH_OPTION="--gpus=all" +if [ "${PLATFORM}" == "rocm" ]; then + LAUNCH_OPTION="--device=/dev/kfd --device=/dev/dri --group-add=video" +fi if [ "${IB_ENVIRONMENT}" == "true" ]; then parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \ + "sudo docker run --rm -itd --privileged --net=host --ipc=host ${LAUNCH_OPTION} \ -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" else parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker run --rm -itd --net=host --ipc=host --gpus=all --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ + "sudo docker run --rm -itd --net=host --ipc=host ${LAUNCH_OPTION} --cap-add=SYS_ADMIN --security-opt seccomp=unconfined \ -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" fi parallel-ssh -i -t 0 -h 
${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ - "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'" + "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh' ${PLATFORM}" diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 4916d2eb..80cd10b1 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -1,5 +1,7 @@ set -e +PLATFORM="${1:-cuda}" + mkdir -p /root/.ssh mv /root/mscclpp/sshkey.pub /root/.ssh/authorized_keys chown root:root /root/.ssh/authorized_keys @@ -8,10 +10,12 @@ chown root:root /root/.ssh/config chmod 400 /root/mscclpp/sshkey chown root:root /root/mscclpp/sshkey -nvidia-smi -pm 1 -for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do - nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i -done +if [ "${PLATFORM}" == "cuda" ]; then + nvidia-smi -pm 1 + for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do + nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i + done +fi make -C /root/mscclpp/tools/peer-access-test /root/mscclpp/tools/peer-access-test/peer_access_test @@ -19,10 +23,13 @@ make -C /root/mscclpp/tools/peer-access-test clean if [[ "${CUDA_VERSION}" == *"11."* ]]; then pip3 install -r /root/mscclpp/python/requirements_cuda11.txt -else +elif [[ "${CUDA_VERSION}" == *"12."* ]]; then pip3 install -r /root/mscclpp/python/requirements_cuda12.txt fi +if [ "${PLATFORM}" == "rocm" ]; then + export CXX=/opt/rocm/bin/hipcc +fi cd /root/mscclpp && pip3 install . pip3 install setuptools_scm python3 -m setuptools_scm --force-write-version-files From 42be3660e0db0279e02ed262edb03202d1570e74 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 10 Feb 2026 10:07:53 +0900 Subject: [PATCH 08/52] Add a new IB stack impl that doesn't use RDMA atomics (#728) * Added configurable InfiniBand (IB) signaling mode. `EndpointConfig::Ib::Mode` enum selects the mode (`Default`, `Host`, `HostNoAtomic`). `Default` is equivalent to `Host` unless specified different by envrionment `MSCCLPP_IBV_MODE`. `Host` corresponds to the previous implementation using RDMA atomics for signaling, while `HostNoAtomic` uses write-with-immediate instead. * Regarding updates in Python bindings and API. 
--- include/mscclpp/core.hpp | 30 +++- include/mscclpp/env.hpp | 6 + python/csrc/core_py.cpp | 21 ++- python/csrc/env_py.cpp | 1 + python/mscclpp/__init__.py | 4 + src/core/communicator.cc | 1 - src/core/connection.cc | 157 +++++++++++++++-- src/core/endpoint.cc | 28 ++- src/core/env.cpp | 2 + src/core/gpu_utils.cc | 43 +---- src/core/ib.cc | 220 +++++++++++++++++------- src/core/include/connection.hpp | 35 ++++ src/core/include/endpoint.hpp | 2 + src/core/include/gpu_utils_internal.hpp | 64 +++++++ src/core/include/ib.hpp | 95 ++++++---- src/core/include/ibverbs_wrapper.hpp | 4 + src/core/semaphore.cc | 5 +- test/mp_unit/ib_tests.cu | 41 ++--- test/mp_unit/mp_unit_tests.hpp | 18 +- test/mp_unit/port_channel_tests.cu | 93 +++++++--- 20 files changed, 648 insertions(+), 222 deletions(-) create mode 100644 src/core/include/gpu_utils_internal.hpp diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 38b05ccf..37bdbd51 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -381,11 +381,19 @@ struct EndpointConfig { /// These settings are only used when the transport is an InfiniBand type (IB0-IB7); they are ignored for other /// transports. struct Ib { + /// IB mode for signaling, used to select between different implementations. + enum class Mode { + Default, // Use the MSCCLPP_IBV_MODE environment variable (or "host" if unset). + Host, // Use the host stack with RDMA atomics. + HostNoAtomic // Use the host stack with write-with-immediate signaling (no RDMA atomics). + }; + static constexpr int DefaultPort = -1; static constexpr int DefaultGidIndex = 0; static constexpr int DefaultMaxCqSize = 1024; static constexpr int DefaultMaxCqPollNum = 1; static constexpr int DefaultMaxSendWr = 8192; + static constexpr int DefaultMaxRecvWr = 16; static constexpr int DefaultMaxWrPerSend = 64; /// Device index. Currently ignored; use transport type (IB0-IB7) to select device. @@ -394,32 +402,41 @@ struct EndpointConfig { int port; /// GID index. int gidIndex; - /// Maximum size of the completion queue. + /// Maximum size of the send completion queue. int maxCqSize; - /// Maximum number of completion queue polls per operation. + /// Maximum number of send completion queue polls per operation. int maxCqPollNum; /// Maximum number of outstanding send work requests. int maxSendWr; + /// Maximum number of outstanding receive work requests (used in HostNoAtomic mode for write-with-immediate). + int maxRecvWr; /// Maximum number of work requests per send operation. int maxWrPerSend; + /// IB mode for signaling. When set to Default, uses the MSCCLPP_IBV_MODE environment variable. + Mode mode; /// Constructor. /// @param deviceIndex Device index. /// @param port Port number. /// @param gidIndex GID index. - /// @param maxCqSize Maximum completion queue size. - /// @param maxCqPollNum Maximum completion queue poll count. + /// @param maxCqSize Maximum send completion queue size. + /// @param maxCqPollNum Maximum send completion queue poll count. /// @param maxSendWr Maximum outstanding send work requests. + /// @param maxRecvWr Maximum outstanding receive work requests (for HostNoAtomic mode). /// @param maxWrPerSend Maximum work requests per send operation. + /// @param mode IB mode for signaling (Default uses MSCCLPP_IBV_MODE env variable). 
Ib(int deviceIndex = -1, int port = DefaultPort, int gidIndex = DefaultGidIndex, int maxCqSize = DefaultMaxCqSize, - int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxWrPerSend = DefaultMaxWrPerSend) + int maxCqPollNum = DefaultMaxCqPollNum, int maxSendWr = DefaultMaxSendWr, int maxRecvWr = DefaultMaxRecvWr, + int maxWrPerSend = DefaultMaxWrPerSend, Mode mode = Mode::Default) : deviceIndex(deviceIndex), port(port), gidIndex(gidIndex), maxCqSize(maxCqSize), maxCqPollNum(maxCqPollNum), maxSendWr(maxSendWr), - maxWrPerSend(maxWrPerSend) {} + maxRecvWr(maxRecvWr), + maxWrPerSend(maxWrPerSend), + mode(mode) {} }; /// Communication transport type (e.g., CudaIpc, IB0-IB7, Ethernet). @@ -658,6 +675,7 @@ class Connection { friend class SemaphoreStub; friend class Semaphore; friend class ProxyService; + friend class BaseConnection; }; /// SemaphoreStub object only used for constructing Semaphore, not for direct use by the user. diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index 9d78cd1a..bd3983e9 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -54,6 +54,12 @@ class Env { /// default libibverbs library found in the system. const std::string ibvSo; + /// Env name: `MSCCLPP_IBV_MODE`. Selects the IB stack implementation for PortChannel. + /// Allowed values: + /// - "host": use the host stack with RDMA atomics (default). + /// - "host-no-atomic": use the host stack with write-with-immediate signaling (no RDMA atomics). + const std::string ibvMode; + /// Env name: `MSCCLPP_HOSTID`. A string that uniquely identifies the host. If unset, it will use the hostname. /// This is used to determine whether the host is the same across different processes. const std::string hostid; diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index 9f085675..a862c7e5 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -147,22 +147,31 @@ void register_core(nb::module_& m) { return ss.str(); }); + nb::enum_(m, "CppIbMode") + .value("Default", EndpointConfig::Ib::Mode::Default) + .value("Host", EndpointConfig::Ib::Mode::Host) + .value("HostNoAtomic", EndpointConfig::Ib::Mode::HostNoAtomic); + nb::class_(m, "CppEndpointConfigIb") .def(nb::init<>()) - .def(nb::init(), nb::arg("device_index") = -1, + .def(nb::init(), nb::arg("device_index") = -1, nb::arg("port") = EndpointConfig::Ib::DefaultPort, nb::arg("gid_index") = EndpointConfig::Ib::DefaultGidIndex, nb::arg("max_cq_size") = EndpointConfig::Ib::DefaultMaxCqSize, nb::arg("max_cq_poll_num") = EndpointConfig::Ib::DefaultMaxCqPollNum, nb::arg("max_send_wr") = EndpointConfig::Ib::DefaultMaxSendWr, - nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend) + nb::arg("max_recv_wr") = EndpointConfig::Ib::DefaultMaxRecvWr, + nb::arg("max_wr_per_send") = EndpointConfig::Ib::DefaultMaxWrPerSend, + nb::arg("mode") = EndpointConfig::Ib::Mode::Default) .def_rw("device_index", &EndpointConfig::Ib::deviceIndex) .def_rw("port", &EndpointConfig::Ib::port) .def_rw("gid_index", &EndpointConfig::Ib::gidIndex) .def_rw("max_cq_size", &EndpointConfig::Ib::maxCqSize) .def_rw("max_cq_poll_num", &EndpointConfig::Ib::maxCqPollNum) .def_rw("max_send_wr", &EndpointConfig::Ib::maxSendWr) - .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend); + .def_rw("max_recv_wr", &EndpointConfig::Ib::maxRecvWr) + .def_rw("max_wr_per_send", &EndpointConfig::Ib::maxWrPerSend) + .def_rw("mode", &EndpointConfig::Ib::mode); nb::class_(m, "CppRegisteredMemory") .def(nb::init<>()) @@ -223,9 +232,15 @@ 
void register_core(nb::module_& m) { .def_prop_rw( "ib_max_send_wr", [](EndpointConfig& self) { return self.ib.maxSendWr; }, [](EndpointConfig& self, int v) { self.ib.maxSendWr = v; }) + .def_prop_rw( + "ib_max_recv_wr", [](EndpointConfig& self) { return self.ib.maxRecvWr; }, + [](EndpointConfig& self, int v) { self.ib.maxRecvWr = v; }) .def_prop_rw( "ib_max_wr_per_send", [](EndpointConfig& self) { return self.ib.maxWrPerSend; }, [](EndpointConfig& self, int v) { self.ib.maxWrPerSend = v; }) + .def_prop_rw( + "ib_mode", [](EndpointConfig& self) { return self.ib.mode; }, + [](EndpointConfig& self, EndpointConfig::Ib::Mode v) { self.ib.mode = v; }) .def_rw("max_write_queue_size", &EndpointConfig::maxWriteQueueSize); nb::class_(m, "CppContext") diff --git a/python/csrc/env_py.cpp b/python/csrc/env_py.cpp index 360acc6f..ce89fd3d 100644 --- a/python/csrc/env_py.cpp +++ b/python/csrc/env_py.cpp @@ -20,6 +20,7 @@ void register_env(nb::module_& m) { .def_ro("socket_family", &Env::socketFamily) .def_ro("socket_ifname", &Env::socketIfname) .def_ro("comm_id", &Env::commId) + .def_ro("ibv_mode", &Env::ibvMode) .def_ro("cache_dir", &Env::cacheDir) .def_ro("npkit_dump_dir", &Env::npkitDumpDir) .def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream); diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 86923003..5f3a2302 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -29,6 +29,8 @@ from ._mscclpp import ( CppConnection as Connection, connect_nvls_collective, CppEndpointConfig as EndpointConfig, + CppEndpointConfigIb as EndpointConfigIb, + CppIbMode as IbMode, CppFifo as Fifo, CppSemaphore as Semaphore, CppHost2DeviceSemaphore as Host2DeviceSemaphore, @@ -61,6 +63,8 @@ __all__ = [ "Connection", "connect_nvls_collective", "EndpointConfig", + "EndpointConfigIb", + "IbMode", "ErrorCode", "Fifo", "Semaphore", diff --git a/src/core/communicator.cc b/src/core/communicator.cc index a146f0de..c95ca421 100644 --- a/src/core/communicator.cc +++ b/src/core/communicator.cc @@ -4,7 +4,6 @@ #include "communicator.hpp" #include "api.h" -#include "debug.h" namespace mscclpp { diff --git a/src/core/connection.cc b/src/core/connection.cc index 10a43e88..6466ca2a 100644 --- a/src/core/connection.cc +++ b/src/core/connection.cc @@ -7,7 +7,7 @@ #include #endif -#include +#include #include #include #include @@ -15,6 +15,7 @@ #include "api.h" #include "context.hpp" #include "endpoint.hpp" +#include "gpu_utils_internal.hpp" #include "logger.hpp" namespace mscclpp { @@ -180,25 +181,131 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // IBConnection +void IBConnection::recvThreadFunc() { + // Set the CUDA device context for this thread + if (localGpuDeviceId_ >= 0) { + cudaError_t err = cudaSetDevice(localGpuDeviceId_); + if (err != cudaSuccess) { + WARN(NET, "IBConnection recvThreadFunc: cudaSetDevice(", localGpuDeviceId_, + ") failed: ", cudaGetErrorString(err)); + return; + } + // Bind this thread to the NUMA node of the local GPU for optimal memory access + int deviceNumaNode = getDeviceNumaNode(localGpuDeviceId_); + if (deviceNumaNode >= 0) { + numaBind(deviceNumaNode); + } + } + + // Host-side buffer to receive newValue from imm_data (need 64-bit for cudaMemcpy) + uint64_t newValueHost = 0; + + while (!stopRecvThread_.load(std::memory_order_relaxed)) { + auto qp = qp_.lock(); + if (!qp) break; + + int wcNum = qp->pollRecvCq(); + if (wcNum < 0) { + WARN(NET, "IBConnection recvThreadFunc: pollRecvCq failed"); + break; + } + + for (int i = 0; i < wcNum; 
++i) { + int status = qp->getRecvWcStatus(i); + if (status != static_cast(WsStatus::Success)) { + WARN(NET, "IBConnection recvThreadFunc: recv work completion failed: ", qp->getRecvWcStatusString(i)); + // Post another recv to replace the failed one + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + continue; + } + + // The imm_data contains newValue (32-bit, extended to 64-bit) + // Note: getRecvWcImmData already converts from network byte order via ntohl + unsigned int immData = qp->getRecvWcImmData(i); + newValueHost = static_cast(immData); + + // Read dstGpuAddr from the local stored address (set by setRemoteUpdateDstAddr) + uint64_t dstGpuAddr = remoteUpdateDstAddr_; + if (dstGpuAddr != 0) { + uint64_t* dstPtr = reinterpret_cast(dstGpuAddr); + + // Use cudaMemcpyAsync with our dedicated stream to avoid blocking on the default stream + MSCCLPP_CUDATHROW( + cudaMemcpyAsync(dstPtr, &newValueHost, sizeof(uint64_t), cudaMemcpyHostToDevice, signalStream_)); + + INFO(CONN, "IBConnection recvThreadFunc: updated GPU ptr ", dstPtr, " to ", newValueHost, " (immData=", immData, + ")"); + } + + // Post another recv for future messages + qp->stageRecv(/*wrId=*/0); + qp->postRecv(); + } + } +} + IBConnection::IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint) : BaseConnection(context, localEndpoint), transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()), - dummyAtomicSource_(std::make_unique(0)) { + dummyAtomicSource_(std::make_unique(0)), + ibNoAtomic_(getImpl(localEndpoint).ibNoAtomic_), + stopRecvThread_(false), + localGpuDeviceId_(localEndpoint.device().id), + signalStream_(nullptr), + remoteUpdateDstAddr_(0) { qp_ = getImpl(localEndpoint).ibQp_; qp_.lock()->rtr(getImpl(remoteEndpoint).ibQpInfo_); qp_.lock()->rts(); dummyAtomicSourceMem_ = context->registerMemory(dummyAtomicSource_.get(), sizeof(uint64_t), transport_); validateTransport(dummyAtomicSourceMem_, transport_); dstTransportInfo_ = getImpl(dummyAtomicSourceMem_).getTransportInfo(transport_); - INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created"); + + if (ibNoAtomic_) { + // Create a CUDA stream for async memory copies + MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&signalStream_, cudaStreamNonBlocking)); + + // Pre-post receive requests for incoming write-with-imm + auto qp = qp_.lock(); + int maxRecvWr = localEndpoint.config().ib.maxRecvWr; + for (int i = 0; i < maxRecvWr; ++i) { + qp->stageRecv(/*wrId=*/0); + } + qp->postRecv(); + // Start the background thread to poll recv CQ + recvThread_ = std::thread([this]() { this->recvThreadFunc(); }); + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with no-atomic mode"); + } else { + INFO(CONN, "IBConnection via ", getIBDeviceName(transport_), " created with atomic mode"); + } +} + +IBConnection::~IBConnection() { + if (ibNoAtomic_) { + stopRecvThread_.store(true, std::memory_order_relaxed); + if (recvThread_.joinable()) { + recvThread_.join(); + } + if (signalStream_ != nullptr) { + // Synchronize stream to ensure all async copies are complete before destruction + // Ignore errors during teardown (CUDA context may already be destroyed) + MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamSynchronize(signalStream_)); + MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cudaStreamDestroy(signalStream_)); + } + } } Transport IBConnection::transport() const { return transport_; } Transport IBConnection::remoteTransport() const { return remoteTransport_; } +void 
IBConnection::setRemoteUpdateDstAddr(uint64_t addr) { + remoteUpdateDstAddr_ = addr; + INFO(CONN, "IBConnection setRemoteUpdateDstAddr: ", (void*)addr); +} + void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, uint64_t size) { #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_WRITE_ENTRY) @@ -220,8 +327,8 @@ void IBConnection::write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMem auto dstMrInfo = dstTransportInfo.ibMrInfo; auto srcMr = srcTransportInfo.ibMr; - qp_.lock()->stageSend(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, /*dstOffset=*/dstOffset, - /*signaled=*/true); + qp_.lock()->stageSendWrite(srcMr, dstMrInfo, (uint32_t)size, /*wrId=*/0, /*srcOffset=*/srcOffset, + /*dstOffset=*/dstOffset, /*signaled=*/true); qp_.lock()->postSend(); INFO(CONN, "IBConnection write: from ", (uint8_t*)srcMr->getBuff() + srcOffset, " to ", @@ -248,12 +355,28 @@ void IBConnection::updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint6 uint64_t oldValue = *src; *src = newValue; - qp_.lock()->stageAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, - /*signaled=*/true); + if (ibNoAtomic_) { + // Use RDMA write-with-imm instead of atomic operation + // Send only newValue in imm_data (0-byte write) + // The remote's recvThreadFunc will use its stored remoteUpdateDstAddr_ to write - qp_.lock()->postSend(); - INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, - " -> ", newValue); + // Put newValue in imm_data (truncated to 32-bit; semaphore counters should fit) + unsigned int immData = static_cast(newValue); + + // Send 0-byte write-with-imm; use dstMrInfo as target (we don't actually write anything) + qp_.lock()->stageSendWriteWithImm(nullptr, dstMrInfo, + /*size=*/0, /*wrId=*/0, + /*srcOffset=*/0, /*dstOffset=*/0, + /*signaled=*/true, /*immData=*/immData); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection write-with-imm: value ", oldValue, " -> ", newValue); + } else { + qp_.lock()->stageSendAtomicAdd(dstTransportInfo_.ibMr, dstMrInfo, /*wrId=*/0, dstOffset, newValue - oldValue, + /*signaled=*/true); + qp_.lock()->postSend(); + INFO(CONN, "IBConnection atomic Write: from ", src, " to ", (uint8_t*)dstMrInfo.addr + dstOffset, ", ", oldValue, + " -> ", newValue); + } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT) NpKit::CollectCpuEvent(NPKIT_EVENT_CONN_IB_UPDATE_AND_SYNC_EXIT, 0, 0, *NpKit::GetCpuTimestamp(), 0); @@ -266,21 +389,21 @@ void IBConnection::flush(int64_t timeoutUsec) { #endif Timer timer; - while (qp_.lock()->getNumCqItems()) { - int wcNum = qp_.lock()->pollCq(); + while (qp_.lock()->getNumSendCqItems()) { + int wcNum = qp_.lock()->pollSendCq(); if (wcNum < 0) { - THROW(NET, IbError, errno, "pollCq failed"); + THROW(NET, IbError, errno, "pollSendCq failed"); } else if (timeoutUsec >= 0) { auto elapsed = timer.elapsed(); if (elapsed > timeoutUsec) { - THROW(CONN, Error, ErrorCode::Timeout, "pollCq timed out: waited for ", elapsed / 1e6, " seconds. Expected ", - qp_.lock()->getNumCqItems(), " signals"); + THROW(CONN, Error, ErrorCode::Timeout, "pollSendCq timed out: waited for ", elapsed / 1e6, + " seconds. 
Expected ", qp_.lock()->getNumSendCqItems(), " signals"); } } for (int i = 0; i < wcNum; ++i) { - int status = qp_.lock()->getWcStatus(i); + int status = qp_.lock()->getSendWcStatus(i); if (status != static_cast(WsStatus::Success)) { - THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getWcStatusString(i)); + THROW(NET, Error, ErrorCode::SystemError, "an IB work item failed: ", qp_.lock()->getSendWcStatusString(i)); } } } diff --git a/src/core/endpoint.cc b/src/core/endpoint.cc index 3833fdc4..4795aa62 100644 --- a/src/core/endpoint.cc +++ b/src/core/endpoint.cc @@ -4,9 +4,13 @@ #include "endpoint.hpp" #include +#include #include "api.h" #include "context.hpp" +#include "ib.hpp" +#include "logger.hpp" +#include "registered_memory.hpp" #include "serialization.hpp" #include "socket.h" #include "utils_internal.hpp" @@ -23,9 +27,31 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl) if (config_.maxWriteQueueSize <= 0) { config_.maxWriteQueueSize = config_.ib.maxCqSize; } + + // Determine if we should use no-atomics mode + ibNoAtomic_ = false; + if (config_.ib.mode == EndpointConfig::Ib::Mode::HostNoAtomic) { + ibNoAtomic_ = true; + } else if (config_.ib.mode == EndpointConfig::Ib::Mode::Default) { + // Use environment variable when mode is Default + ibNoAtomic_ = (env()->ibvMode == "host-no-atomic"); + } + + // If mode is Host (or Default resolved to host), check if atomics are supported + if (!ibNoAtomic_) { + IbCtx* ibCtx = contextImpl.getIbContext(config_.transport); + if (!ibCtx->supportsRdmaAtomics()) { + WARN(NET, "IB device ", ibCtx->getDevName(), + " does not support RDMA atomics. Falling back to write-with-immediate mode (HostNoAtomic)."); + ibNoAtomic_ = true; + } + } + + int maxRecvWr = ibNoAtomic_ ? 
config_.ib.maxRecvWr : 0; + ibQp_ = contextImpl.getIbContext(config_.transport) ->createQp(config_.ib.port, config_.ib.gidIndex, config_.ib.maxCqSize, config_.ib.maxCqPollNum, - config_.ib.maxSendWr, 0, config_.ib.maxWrPerSend); + config_.ib.maxSendWr, maxRecvWr, config_.ib.maxWrPerSend); ibQpInfo_ = ibQp_->getInfo(); } else if (config_.transport == Transport::Ethernet) { // Configuring Ethernet Interfaces diff --git a/src/core/env.cpp b/src/core/env.cpp index 508208e9..a70e3d28 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -54,6 +54,7 @@ Env::Env() logFile(readEnv("MSCCLPP_LOG_FILE", "")), hcaDevices(readEnv("MSCCLPP_HCA_DEVICES", "")), ibvSo(readEnv("MSCCLPP_IBV_SO", "")), + ibvMode(readEnv("MSCCLPP_IBV_MODE", "host")), hostid(readEnv("MSCCLPP_HOSTID", "")), socketFamily(readEnv("MSCCLPP_SOCKET_FAMILY", "")), socketIfname(readEnv("MSCCLPP_SOCKET_IFNAME", "")), @@ -80,6 +81,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_LOG_FILE", globalEnv->logFile); logEnv("MSCCLPP_HCA_DEVICES", globalEnv->hcaDevices); logEnv("MSCCLPP_IBV_SO", globalEnv->ibvSo); + logEnv("MSCCLPP_IBV_MODE", globalEnv->ibvMode); logEnv("MSCCLPP_HOSTID", globalEnv->hostid); logEnv("MSCCLPP_SOCKET_FAMILY", globalEnv->socketFamily); logEnv("MSCCLPP_SOCKET_IFNAME", globalEnv->socketIfname); diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 3aa6aa1c..628d2dcb 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -5,48 +5,7 @@ #include #include -#include "debug.h" - -static inline bool isCudaTeardownError(cudaError_t err) { -#if defined(MSCCLPP_USE_ROCM) - return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; -#else // !defined(MSCCLPP_USE_ROCM) - return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || - err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; -#endif // !defined(MSCCLPP_USE_ROCM) -} - -[[maybe_unused]] static inline bool isCuTeardownError(CUresult r) { - return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; -} - -#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - cudaError_t __e = cmd; \ - if (isCudaTeardownError(__e)) { \ - (void)cudaGetLastError(); \ - } else { \ - MSCCLPP_CUDATHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ - do { \ - CUresult __e = cmd; \ - if (!isCuTeardownError(__e)) { \ - MSCCLPP_CUTHROW(__e); \ - } \ - } while (false) - -#define MSCCLPP_CUTHROW_IGNORE(cmd) \ - do { \ - CUresult __e = cmd; \ - if (__e != CUDA_SUCCESS) { \ - const char* errStr; \ - cuGetErrorString(__e, &errStr); \ - WARN("%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, __e, errStr); \ - } \ - } while (false) +#include "gpu_utils_internal.hpp" namespace mscclpp { diff --git a/src/core/ib.cc b/src/core/ib.cc index 9b86cdf1..2e7b867d 100644 --- a/src/core/ib.cc +++ b/src/core/ib.cc @@ -3,6 +3,7 @@ #include "ib.hpp" +#include #include #include @@ -129,30 +130,46 @@ const void* IbMr::getBuff() const { return buff_; } uint32_t IbMr::getLkey() const { return mr_->lkey; } -IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, - int maxRecvWr, int maxWrPerSend) +IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, + int maxSendWr, int maxRecvWr, int maxWrPerSend) : portNum_(portNum), gidIndex_(gidIndex), info_(), qp_(nullptr), - cq_(nullptr), - 
wcs_(), - wrs_(), - sges_(), - wrn_(0), - numSignaledPostedItems_(0), - numSignaledStagedItems_(0), - maxCqPollNum_(maxCqPollNum), - maxWrPerSend_(maxWrPerSend) { - cq_ = IBVerbs::ibv_create_cq(ctx, maxCqSize, nullptr, nullptr, 0); - if (cq_ == nullptr) { + sendCq_(nullptr), + recvCq_(nullptr), + sendWcs_(), + recvWcs_(), + sendWrs_(), + sendSges_(), + recvWrs_(), + recvSges_(), + numStagedSend_(0), + numStagedRecv_(0), + numPostedSignaledSend_(0), + numStagedSignaledSend_(0), + maxSendCqPollNum_(maxSendCqPollNum), + maxSendWr_(maxSendWr), + maxWrPerSend_(maxWrPerSend), + maxRecvWr_(maxRecvWr) { + sendCq_ = IBVerbs::ibv_create_cq(ctx, maxSendCqSize, nullptr, nullptr, 0); + if (sendCq_ == nullptr) { THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); } + // Only create recv CQ if maxRecvWr > 0 + if (maxRecvWr > 0) { + recvCq_ = IBVerbs::ibv_create_cq(ctx, maxRecvWr, nullptr, nullptr, 0); + if (recvCq_ == nullptr) { + THROW(NET, IbError, errno, "ibv_create_cq failed (errno ", errno, ")"); + } + } + struct ibv_qp_init_attr qpInitAttr = {}; qpInitAttr.sq_sig_all = 0; - qpInitAttr.send_cq = cq_; - qpInitAttr.recv_cq = cq_; + qpInitAttr.send_cq = sendCq_; + // Use separate recv CQ if created, otherwise use the send CQ + qpInitAttr.recv_cq = (recvCq_ != nullptr) ? recvCq_ : sendCq_; qpInitAttr.qp_type = IBV_QPT_RC; qpInitAttr.cap.max_send_wr = maxSendWr; qpInitAttr.cap.max_recv_wr = maxRecvWr; @@ -173,9 +190,9 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz info_.linkLayer = portAttr.link_layer; info_.qpn = qp->qp_num; info_.mtu = portAttr.active_mtu; - info_.is_grh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); + info_.isGrh = (portAttr.flags & IBV_QPF_GRH_REQUIRED); - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.is_grh) { + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND || info_.isGrh) { if (gidIndex_ >= portAttr.gid_tbl_len) { THROW(NET, Error, ErrorCode::InvalidUsage, "invalid GID index ", gidIndex_, " for port ", portNum_, " (max index is ", portAttr.gid_tbl_len - 1, ")"); @@ -199,14 +216,22 @@ IbQp::IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSiz THROW(NET, IbError, errno, "ibv_modify_qp failed (errno ", errno, ")"); } qp_ = qp; - wrs_ = std::make_shared>(maxWrPerSend_); - sges_ = std::make_shared>(maxWrPerSend_); - wcs_ = std::make_shared>(maxCqPollNum_); + sendWrs_ = std::make_shared>(maxWrPerSend_); + sendSges_ = std::make_shared>(maxWrPerSend_); + sendWcs_ = std::make_shared>(maxSendCqPollNum_); + recvWcs_ = std::make_shared>(maxRecvWr_); + if (maxRecvWr_ > 0) { + recvWrs_ = std::make_shared>(maxRecvWr_); + recvSges_ = std::make_shared>(maxRecvWr_); + } } IbQp::~IbQp() { IBVerbs::ibv_destroy_qp(qp_); - IBVerbs::ibv_destroy_cq(cq_); + IBVerbs::ibv_destroy_cq(sendCq_); + if (recvCq_ != nullptr) { + IBVerbs::ibv_destroy_cq(recvCq_); + } } void IbQp::rtr(const IbQpInfo& info) { @@ -217,7 +242,7 @@ void IbQp::rtr(const IbQpInfo& info) { qp_attr.rq_psn = 0; qp_attr.max_dest_rd_atomic = 1; qp_attr.min_rnr_timer = 0x12; - if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.is_grh) { + if (info.linkLayer == IBV_LINK_LAYER_ETHERNET || info.isGrh) { qp_attr.ah_attr.is_global = 1; qp_attr.ah_attr.grh.dgid.global.subnet_prefix = info.spn; qp_attr.ah_attr.grh.dgid.global.interface_id = info.iid; @@ -256,25 +281,25 @@ void IbQp::rts() { } } -IbQp::WrInfo IbQp::getNewWrInfo() { - if (wrn_ >= maxWrPerSend_) { - THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding work requests. 
limit is ", maxWrPerSend_); +IbQp::SendWrInfo IbQp::getNewSendWrInfo() { + if (numStagedSend_ >= maxWrPerSend_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many staged work requests. limit is ", maxWrPerSend_); } - ibv_send_wr* wr_ = &wrs_->data()[wrn_]; - ibv_sge* sge_ = &sges_->data()[wrn_]; + ibv_send_wr* wr_ = &sendWrs_->data()[numStagedSend_]; + ibv_sge* sge_ = &sendSges_->data()[numStagedSend_]; wr_->sg_list = sge_; wr_->num_sge = 1; wr_->next = nullptr; - if (wrn_ > 0) { - (*wrs_)[wrn_ - 1].next = wr_; + if (numStagedSend_ > 0) { + (*sendWrs_)[numStagedSend_ - 1].next = wr_; } - wrn_++; - return IbQp::WrInfo{wr_, sge_}; + numStagedSend_++; + return IbQp::SendWrInfo{wr_, sge_}; } -void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -283,12 +308,12 @@ void IbQp::stageSend(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64 wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; wrInfo.sge->length = size; wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, - bool signaled) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; wrInfo.wr->send_flags = signaled ? IBV_SEND_SIGNALED : 0; @@ -298,57 +323,118 @@ void IbQp::stageAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, u wrInfo.sge->addr = (uint64_t)(mr->getBuff()); wrInfo.sge->length = sizeof(uint64_t); // atomic op is always on uint64_t wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + if (signaled) numStagedSignaledSend_++; } -void IbQp::stageSendWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, - uint64_t dstOffset, bool signaled, unsigned int immData) { - auto wrInfo = this->getNewWrInfo(); +void IbQp::stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData) { + auto wrInfo = this->getNewSendWrInfo(); wrInfo.wr->wr_id = wrId; wrInfo.wr->opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wrInfo.wr->send_flags = signaled ? 
IBV_SEND_SIGNALED : 0; wrInfo.wr->wr.rdma.remote_addr = (uint64_t)(info.addr) + dstOffset; wrInfo.wr->wr.rdma.rkey = info.rkey; - wrInfo.wr->imm_data = immData; - wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; - wrInfo.sge->length = size; - wrInfo.sge->lkey = mr->getLkey(); - if (signaled) numSignaledStagedItems_++; + wrInfo.wr->imm_data = htonl(immData); + if (mr != nullptr) { + wrInfo.sge->addr = (uint64_t)(mr->getBuff()) + srcOffset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + } else { + // 0-byte write-with-imm: no source buffer needed + wrInfo.sge->addr = 0; + wrInfo.sge->length = 0; + wrInfo.sge->lkey = 0; + } + if (signaled) numStagedSignaledSend_++; } void IbQp::postSend() { - if (wrn_ == 0) { + if (numStagedSend_ == 0) { return; } struct ibv_send_wr* bad_wr; - int err = IBVerbs::ibv_post_send(qp_, wrs_->data(), &bad_wr); + int err = IBVerbs::ibv_post_send(qp_, sendWrs_->data(), &bad_wr); if (err != 0) { THROW(NET, IbError, err, "ibv_post_send failed (errno ", err, ")"); } - wrn_ = 0; - numSignaledPostedItems_ += numSignaledStagedItems_; - numSignaledStagedItems_ = 0; - if (numSignaledPostedItems_ + 4 > cq_->cqe) { - WARN(NET, "IB: CQ is almost full ( ", numSignaledPostedItems_, " / ", cq_->cqe, + numStagedSend_ = 0; + numPostedSignaledSend_ += numStagedSignaledSend_; + numStagedSignaledSend_ = 0; + if (numPostedSignaledSend_ + 4 > sendCq_->cqe) { + WARN(NET, "IB: CQ is almost full ( ", numPostedSignaledSend_, " / ", sendCq_->cqe, " ). The connection needs to be flushed to prevent timeout errors."); } } -int IbQp::pollCq() { - int wcNum = IBVerbs::ibv_poll_cq(cq_, maxCqPollNum_, wcs_->data()); +IbQp::RecvWrInfo IbQp::getNewRecvWrInfo() { + if (numStagedRecv_ >= maxRecvWr_) { + THROW(NET, Error, ErrorCode::InvalidUsage, "too many outstanding recv work requests. limit is ", maxRecvWr_); + } + ibv_recv_wr* wr = &recvWrs_->data()[numStagedRecv_]; + ibv_sge* sge = &recvSges_->data()[numStagedRecv_]; + wr->next = nullptr; + if (numStagedRecv_ > 0) { + (*recvWrs_)[numStagedRecv_ - 1].next = wr; + } + numStagedRecv_++; + return IbQp::RecvWrInfo{wr, sge}; +} + +void IbQp::stageRecv(uint64_t wrId) { + auto wrInfo = this->getNewRecvWrInfo(); + // For RDMA write-with-imm, data goes to remote_addr specified by sender. + // We only need the recv WR to get the completion notification with imm_data. 
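  // Receiver-side usage sketch (mirrors IBConnection::recvThreadFunc above):
  // arm a completion slot, then poll and read the immediate value, which
  // getRecvWcImmData() returns already converted to host byte order.
  //   qp->stageRecv(/*wrId=*/0);
  //   qp->postRecv();
  //   int n = qp->pollRecvCq();
  //   if (n > 0 && qp->getRecvWcStatus(0) == static_cast<int>(WsStatus::Success)) {
  //     unsigned int v = qp->getRecvWcImmData(0);
  //   }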
+ wrInfo.wr->wr_id = wrId; + wrInfo.wr->sg_list = nullptr; + wrInfo.wr->num_sge = 0; +} + +void IbQp::stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset) { + auto wrInfo = this->getNewRecvWrInfo(); + wrInfo.wr->wr_id = wrId; + wrInfo.sge->addr = reinterpret_cast(mr->getBuff()) + offset; + wrInfo.sge->length = size; + wrInfo.sge->lkey = mr->getLkey(); + wrInfo.wr->sg_list = wrInfo.sge; + wrInfo.wr->num_sge = 1; +} + +void IbQp::postRecv() { + if (numStagedRecv_ == 0) return; + struct ibv_recv_wr* bad_wr; + int err = IBVerbs::ibv_post_recv(qp_, recvWrs_->data(), &bad_wr); + if (err != 0) { + THROW(NET, IbError, err, "ibv_post_recv failed (errno ", err, ")"); + } + numStagedRecv_ = 0; +} + +int IbQp::pollSendCq() { + int wcNum = IBVerbs::ibv_poll_cq(sendCq_, maxSendCqPollNum_, sendWcs_->data()); if (wcNum > 0) { - numSignaledPostedItems_ -= wcNum; + numPostedSignaledSend_ -= wcNum; } return wcNum; } -int IbQp::getWcStatus(int idx) const { return (*wcs_)[idx].status; } +int IbQp::pollRecvCq() { + int wcNum = IBVerbs::ibv_poll_cq(recvCq_, maxRecvWr_, recvWcs_->data()); + return wcNum; +} -std::string IbQp::getWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*wcs_)[idx].status); } +int IbQp::getSendWcStatus(int idx) const { return (*sendWcs_)[idx].status; } -int IbQp::getNumCqItems() const { return numSignaledPostedItems_; } +std::string IbQp::getSendWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*sendWcs_)[idx].status); } -IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr) { +int IbQp::getNumSendCqItems() const { return numPostedSignaledSend_; } + +int IbQp::getRecvWcStatus(int idx) const { return (*recvWcs_)[idx].status; } + +std::string IbQp::getRecvWcStatusString(int idx) const { return IBVerbs::ibv_wc_status_str((*recvWcs_)[idx].status); } + +unsigned int IbQp::getRecvWcImmData(int idx) const { return ntohl((*recvWcs_)[idx].imm_data); } + +IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_(nullptr), supportsRdmaAtomics_(false) { int num; struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num); for (int i = 0; i < num; ++i) { @@ -365,6 +451,12 @@ IbCtx::IbCtx(const std::string& devName) : devName_(devName), ctx_(nullptr), pd_ if (pd_ == nullptr) { THROW(NET, IbError, errno, "ibv_alloc_pd failed (errno ", errno, ")"); } + + // Query and cache RDMA atomics capability + struct ibv_device_attr attr = {}; + if (IBVerbs::ibv_query_device(ctx_, &attr) == 0) { + supportsRdmaAtomics_ = (attr.atomic_cap == IBV_ATOMIC_HCA || attr.atomic_cap == IBV_ATOMIC_GLOB); + } } IbCtx::~IbCtx() { @@ -419,7 +511,7 @@ int IbCtx::getAnyUsablePort(int gidIndex) const { return -1; } -std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, +std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, int maxRecvWr, int maxWrPerSend) { if (port == -1) { port = this->getAnyUsablePort(gidIndex); @@ -430,13 +522,15 @@ std::shared_ptr IbCtx::createQp(int port, int gidIndex, int maxCqSize, int THROW(NET, Error, ErrorCode::InvalidUsage, "invalid IB port: ", port); } return std::shared_ptr( - new IbQp(ctx_, pd_, port, gidIndex, maxCqSize, maxCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); + new IbQp(ctx_, pd_, port, gidIndex, maxSendCqSize, maxSendCqPollNum, maxSendWr, maxRecvWr, maxWrPerSend)); } std::unique_ptr IbCtx::registerMr(void* buff, std::size_t size) { return std::unique_ptr(new 
IbMr(pd_, buff, size)); } +bool IbCtx::supportsRdmaAtomics() const { return supportsRdmaAtomics_; } + MSCCLPP_API_CPP int getIBDeviceCount() { int num; IBVerbs::ibv_get_device_list(&num); diff --git a/src/core/include/connection.hpp b/src/core/include/connection.hpp index c9d81d41..06e733c7 100644 --- a/src/core/include/connection.hpp +++ b/src/core/include/connection.hpp @@ -4,11 +4,17 @@ #ifndef MSCCLPP_CONNECTION_HPP_ #define MSCCLPP_CONNECTION_HPP_ +#include #include #include +#include +#include +#include +#include #include "communicator.hpp" #include "context.hpp" +#include "endpoint.hpp" #include "ib.hpp" #include "registered_memory.hpp" #include "socket.h" @@ -29,6 +35,12 @@ class BaseConnection { virtual void flush(int64_t timeoutUsec = -1) = 0; + /// Set the local address where remote updateAndSync operations should write. + /// This is called by the receiver to specify where incoming signals should be written. + /// Default implementation is a no-op for connections that don't need it. + /// @param addr The local address for incoming writes. + virtual void setRemoteUpdateDstAddr(uint64_t /*addr*/) {} + virtual Transport transport() const = 0; virtual Transport remoteTransport() const = 0; @@ -39,6 +51,8 @@ class BaseConnection { int getMaxWriteQueueSize() const; + static std::shared_ptr& getImpl(Connection& conn) { return conn.impl_; } + protected: friend class Context; friend class CudaIpcConnection; @@ -81,8 +95,29 @@ class IBConnection : public BaseConnection { RegisteredMemory dummyAtomicSourceMem_; mscclpp::TransportInfo dstTransportInfo_; + // For write-with-imm mode (HostNoAtomic): uses RDMA write-with-imm to signal + // instead of atomic operations, with a host thread forwarding to GPU for memory consistency. + bool ibNoAtomic_; + std::thread recvThread_; + std::atomic stopRecvThread_; + int localGpuDeviceId_; // Local GPU device ID for setting CUDA context in recv thread + cudaStream_t signalStream_; + + // Write-with-imm design: + // - Sender: 0-byte RDMA write-with-imm to dst MR, newValue in imm_data (32-bit) + // - Receiver: uses remoteUpdateDstAddr_ (set via setRemoteUpdateDstAddr) to know where to write + uint64_t remoteUpdateDstAddr_; + + void recvThreadFunc(); + public: IBConnection(std::shared_ptr context, const Endpoint& localEndpoint, const Endpoint& remoteEndpoint); + ~IBConnection(); + + /// Set the local address where remote updateAndSync operations will write. + /// Must be called before the remote sends any updateAndSync in host-no-atomic mode. + /// @param addr The local address for incoming writes. + void setRemoteUpdateDstAddr(uint64_t addr) override; Transport transport() const override; diff --git a/src/core/include/endpoint.hpp b/src/core/include/endpoint.hpp index a3a5ad41..363faab1 100644 --- a/src/core/include/endpoint.hpp +++ b/src/core/include/endpoint.hpp @@ -4,6 +4,7 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include #include #include @@ -24,6 +25,7 @@ struct Endpoint::Impl { // The following are only used for IB and are undefined for other transports. bool ibLocal_; + bool ibNoAtomic_; std::shared_ptr ibQp_; IbQpInfo ibQpInfo_; diff --git a/src/core/include/gpu_utils_internal.hpp b/src/core/include/gpu_utils_internal.hpp new file mode 100644 index 00000000..a7cea86b --- /dev/null +++ b/src/core/include/gpu_utils_internal.hpp @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
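// Signaling flow in HostNoAtomic mode, per the design notes in connection.hpp
// above (an illustrative sketch using this patch's APIs; flagPtr is a
// hypothetical local semaphore address):
//   receiver: conn->setRemoteUpdateDstAddr(reinterpret_cast<uint64_t>(flagPtr));
//   sender:   conn->updateAndSync(dstMem, /*dstOffset=*/0, &hostCounter, newValue);
//             // issues a 0-byte RDMA write-with-imm carrying newValue (32-bit)
//   receiver: the recv thread polls the recv CQ and cudaMemcpyAsync()s the
//             zero-extended 64-bit value into *flagPtr on a dedicated stream.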
+ +#ifndef MSCCLPP_GPU_UTILS_INTERNAL_HPP_ +#define MSCCLPP_GPU_UTILS_INTERNAL_HPP_ + +#include +#include + +#include "logger.hpp" + +namespace mscclpp { + +/// Check if a CUDA error indicates runtime teardown (safe to ignore in destructors). +inline bool isCudaTeardownError(cudaError_t err) { +#if defined(MSCCLPP_USE_ROCM) + return err == cudaErrorContextIsDestroyed || err == cudaErrorInvalidDevice; +#else // !defined(MSCCLPP_USE_ROCM) + return err == cudaErrorCudartUnloading || err == cudaErrorContextIsDestroyed || err == cudaErrorInitializationError || + err == cudaErrorInvalidDevice || err == cudaErrorLaunchFailure || err == cudaErrorDeviceUninitialized; +#endif // !defined(MSCCLPP_USE_ROCM) +} + +/// Check if a CUDA driver error indicates runtime teardown. +inline bool isCuTeardownError(CUresult r) { + return r == CUDA_ERROR_DEINITIALIZED || r == CUDA_ERROR_CONTEXT_IS_DESTROYED || r == CUDA_ERROR_LAUNCH_FAILED; +} + +} // namespace mscclpp + +/// Execute a CUDA runtime call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUDATHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + cudaError_t __e = cmd; \ + if (mscclpp::isCudaTeardownError(__e)) { \ + (void)cudaGetLastError(); \ + } else { \ + MSCCLPP_CUDATHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and ignore teardown errors (useful in destructors). +/// Non-teardown errors will throw. +#define MSCCLPP_CUTHROW_IGNORE_TEARDOWN(cmd) \ + do { \ + CUresult __e = cmd; \ + if (!mscclpp::isCuTeardownError(__e)) { \ + MSCCLPP_CUTHROW(__e); \ + } \ + } while (false) + +/// Execute a CUDA driver call and log (but don't throw) on error. +#define MSCCLPP_CUTHROW_IGNORE(cmd) \ + do { \ + CUresult __e = cmd; \ + if (__e != CUDA_SUCCESS) { \ + const char* errStr; \ + cuGetErrorString(__e, &errStr); \ + WARN(GPU, __FILE__, ":", __LINE__, " Cuda failure ", static_cast(__e), " '", errStr, "'"); \ + } \ + } while (false) + +#endif // MSCCLPP_GPU_UTILS_INTERNAL_HPP_ diff --git a/src/core/include/ib.hpp b/src/core/include/ib.hpp index c6436dbf..e9363e9c 100644 --- a/src/core/include/ib.hpp +++ b/src/core/include/ib.hpp @@ -17,6 +17,7 @@ struct ibv_qp; struct ibv_cq; struct ibv_wc; struct ibv_send_wr; +struct ibv_recv_wr; struct ibv_sge; namespace mscclpp { @@ -28,11 +29,11 @@ struct IbMrInfo { class IbMr { public: - virtual ~IbMr(); + ~IbMr(); - virtual IbMrInfo getInfo() const; - virtual const void* getBuff() const; - virtual uint32_t getLkey() const; + IbMrInfo getInfo() const; + const void* getBuff() const; + uint32_t getLkey() const; private: IbMr(ibv_pd* pd, void* buff, std::size_t size); @@ -52,7 +53,7 @@ struct IbQpInfo { uint64_t spn; int mtu; uint64_t iid; - bool is_grh; + bool isGrh; }; enum class WsStatus { @@ -61,38 +62,48 @@ enum class WsStatus { class IbQp { public: - virtual ~IbQp(); + ~IbQp(); - virtual void rtr([[maybe_unused]] const IbQpInfo& info); - virtual void rts(); - virtual void stageSend([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled); - virtual void stageAtomicAdd([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const IbMrInfo& info, - [[maybe_unused]] uint64_t wrId, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] uint64_t addVal, [[maybe_unused]] bool signaled); - virtual void stageSendWithImm([[maybe_unused]] const IbMr* mr, [[maybe_unused]] const 
IbMrInfo& info, - [[maybe_unused]] uint32_t size, [[maybe_unused]] uint64_t wrId, - [[maybe_unused]] uint64_t srcOffset, [[maybe_unused]] uint64_t dstOffset, - [[maybe_unused]] bool signaled, [[maybe_unused]] unsigned int immData); - virtual void postSend(); - virtual int pollCq(); + void rtr(const IbQpInfo& info); + void rts(); + void stageSendWrite(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled); + void stageSendAtomicAdd(const IbMr* mr, const IbMrInfo& info, uint64_t wrId, uint64_t dstOffset, uint64_t addVal, + bool signaled); + void stageSendWriteWithImm(const IbMr* mr, const IbMrInfo& info, uint32_t size, uint64_t wrId, uint64_t srcOffset, + uint64_t dstOffset, bool signaled, unsigned int immData); + void postSend(); + + void stageRecv(uint64_t wrId); + void stageRecv(const IbMr* mr, uint64_t wrId, uint32_t size, uint64_t offset = 0); + void postRecv(); + + int pollSendCq(); + int pollRecvCq(); IbQpInfo& getInfo() { return info_; } - virtual int getWcStatus([[maybe_unused]] int idx) const; - virtual std::string getWcStatusString([[maybe_unused]] int idx) const; - virtual int getNumCqItems() const; + int getSendWcStatus(int idx) const; + std::string getSendWcStatusString(int idx) const; + int getNumSendCqItems() const; + int getRecvWcStatus(int idx) const; + std::string getRecvWcStatusString(int idx) const; + unsigned int getRecvWcImmData(int idx) const; private: - struct WrInfo { + struct SendWrInfo { ibv_send_wr* wr; ibv_sge* sge; }; - IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, + struct RecvWrInfo { + ibv_recv_wr* wr; + ibv_sge* sge; + }; + + IbQp(ibv_context* ctx, ibv_pd* pd, int portNum, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, int maxRecvWr, int maxWrPerSend); - WrInfo getNewWrInfo(); + SendWrInfo getNewSendWrInfo(); + RecvWrInfo getNewRecvWrInfo(); int portNum_; int gidIndex_; @@ -100,16 +111,23 @@ class IbQp { IbQpInfo info_; ibv_qp* qp_; - ibv_cq* cq_; - std::shared_ptr> wcs_; - std::shared_ptr> wrs_; - std::shared_ptr> sges_; - int wrn_; - int numSignaledPostedItems_; - int numSignaledStagedItems_; + ibv_cq* sendCq_; + ibv_cq* recvCq_; + std::shared_ptr> sendWcs_; + std::shared_ptr> recvWcs_; + std::shared_ptr> sendWrs_; + std::shared_ptr> sendSges_; + std::shared_ptr> recvWrs_; + std::shared_ptr> recvSges_; + int numStagedSend_; + int numStagedRecv_; + int numPostedSignaledSend_; + int numStagedSignaledSend_; - const int maxCqPollNum_; + const int maxSendCqPollNum_; + const int maxSendWr_; const int maxWrPerSend_; + const int maxRecvWr_; friend class IbCtx; }; @@ -120,9 +138,10 @@ class IbCtx { IbCtx(const std::string& devName); ~IbCtx(); - std::shared_ptr createQp(int port, int gidIndex, int maxCqSize, int maxCqPollNum, int maxSendWr, int maxRecvWr, - int maxWrPerSend); + std::shared_ptr createQp(int port, int gidIndex, int maxSendCqSize, int maxSendCqPollNum, int maxSendWr, + int maxRecvWr, int maxWrPerSend); std::unique_ptr registerMr(void* buff, std::size_t size); + bool supportsRdmaAtomics() const; #else IbCtx([[maybe_unused]] const std::string& devName) {} ~IbCtx() {} @@ -131,6 +150,7 @@ class IbCtx { std::unique_ptr registerMr([[maybe_unused]] void* buff, [[maybe_unused]] std::size_t size) { return nullptr; } + bool supportsRdmaAtomics() const { return false; } #endif const std::string& getDevName() const { return devName_; }; @@ -142,6 +162,7 @@ class IbCtx { const std::string devName_; 
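  // Note: supportsRdmaAtomics_ (declared below) is cached once in the IbCtx
  // constructor from ibv_query_device(); Endpoint::Impl consults it so that
  // devices reporting neither IBV_ATOMIC_HCA nor IBV_ATOMIC_GLOB fall back to
  // the HostNoAtomic signaling mode.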
ibv_context* ctx_; ibv_pd* pd_; + bool supportsRdmaAtomics_; }; } // namespace mscclpp diff --git a/src/core/include/ibverbs_wrapper.hpp b/src/core/include/ibverbs_wrapper.hpp index b5ab2eff..5b0da8ba 100644 --- a/src/core/include/ibverbs_wrapper.hpp +++ b/src/core/include/ibverbs_wrapper.hpp @@ -102,6 +102,10 @@ struct IBVerbs { return qp->context->ops.post_send(qp, wr, bad_wr); } + static inline int ibv_post_recv(struct ibv_qp* qp, struct ibv_recv_wr* wr, struct ibv_recv_wr** bad_wr) { + return qp->context->ops.post_recv(qp, wr, bad_wr); + } + static inline int ibv_poll_cq(struct ibv_cq* cq, int num_entries, struct ibv_wc* wc) { return cq->context->ops.poll_cq(cq, num_entries, wc); } diff --git a/src/core/semaphore.cc b/src/core/semaphore.cc index 57ac5979..c6eb1e23 100644 --- a/src/core/semaphore.cc +++ b/src/core/semaphore.cc @@ -8,7 +8,6 @@ #include "atomic.hpp" #include "connection.hpp" #include "context.hpp" -#include "debug.h" #include "registered_memory.hpp" #include "serialization.hpp" @@ -122,6 +121,8 @@ MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(const Semaphore& sema if (connection().localDevice().type != DeviceType::GPU) { throw Error("Local endpoint device type of Host2DeviceSemaphore should be GPU", ErrorCode::InvalidUsage); } + BaseConnection::getImpl(connection()) + ->setRemoteUpdateDstAddr(reinterpret_cast(semaphore_.localMemory().data())); } MSCCLPP_API_CPP Host2DeviceSemaphore::Host2DeviceSemaphore(Communicator& communicator, const Connection& connection) @@ -150,6 +151,8 @@ MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(const Semaphore& semaphor if (connection().localDevice().type != DeviceType::CPU) { throw Error("Local endpoint device type of Host2HostSemaphore should be CPU", ErrorCode::InvalidUsage); } + BaseConnection::getImpl(connection()) + ->setRemoteUpdateDstAddr(reinterpret_cast(semaphore_.localMemory().data())); } MSCCLPP_API_CPP Host2HostSemaphore::Host2HostSemaphore(Communicator& communicator, const Connection& connection) diff --git a/test/mp_unit/ib_tests.cu b/test/mp_unit/ib_tests.cu index 8475ccf9..051030ac 100644 --- a/test/mp_unit/ib_tests.cu +++ b/test/mp_unit/ib_tests.cu @@ -63,20 +63,21 @@ void IbPeerToPeerTest::registerBufferAndConnect(void* buf, size_t size) { bootstrap->barrier(); } -void IbPeerToPeerTest::stageSend(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled) { +void IbPeerToPeerTest::stageSendWrite(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, + bool signaled) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1]; - qp->stageSend(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled); + qp->stageSendWrite(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled); } -void IbPeerToPeerTest::stageAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled) { +void IbPeerToPeerTest::stageSendAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 
0 : 1]; - qp->stageAtomicAdd(mr.get(), remoteMrInfo, wrId, dstOffset, addVal, signaled); + qp->stageSendAtomicAdd(mr.get(), remoteMrInfo, wrId, dstOffset, addVal, signaled); } -void IbPeerToPeerTest::stageSendWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, - bool signaled, unsigned int immData) { +void IbPeerToPeerTest::stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, + bool signaled, unsigned int immData) { const mscclpp::IbMrInfo& remoteMrInfo = mrInfo[(gEnv->rank == 1) ? 0 : 1]; - qp->stageSendWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); + qp->stageSendWriteWithImm(mr.get(), remoteMrInfo, size, wrId, srcOffset, dstOffset, signaled, immData); } TEST_F(IbPeerToPeerTest, SimpleSendRecv) { @@ -96,15 +97,15 @@ TEST_F(IbPeerToPeerTest, SimpleSendRecv) { if (gEnv->rank == 1) { mscclpp::Timer timer; for (int iter = 0; iter < maxIter; ++iter) { - stageSend(sizeof(uint64_t) * nelem, 0, 0, 0, true); + stageSendWrite(sizeof(uint64_t) * nelem, 0, 0, 0, true); qp->postSend(); bool waiting = true; int spin = 0; while (waiting) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); ASSERT_GE(wcNum, 0); for (int i = 0; i < wcNum; ++i) { - int status = qp->getWcStatus(i); + int status = qp->getSendWcStatus(i); EXPECT_EQ(status, static_cast(mscclpp::WsStatus::Success)); waiting = false; break; @@ -261,26 +262,26 @@ TEST_F(IbPeerToPeerTest, MemoryConsistency) { bool signaled = (iter % signalPeriod == 0); // Send from the second element to the last - stageSend(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled); + stageSendWrite(sizeof(uint64_t) * (nelem - 1), 0, sizeof(uint64_t), sizeof(uint64_t), signaled); qp->postSend(); #if 0 // For reference: send the first element using a normal send. This should occasionally see a wrong result. - stageSend(sizeof(uint64_t), 0, 0, 0, false); + stageSendWrite(sizeof(uint64_t), 0, 0, 0, false); qp->postSend(); #else // Send the first element using AtomicAdd. This should see the correct result. 
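      // Rationale (sketch): on an RC queue pair, work requests execute in
      // posting order, and the fetch-and-add's remote effect becomes visible
      // only after the payload of the preceding RDMA write has been placed.
      // Polling the first element therefore also certifies the rest of the
      // buffer, which the plain write in the disabled branch above does not.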
- stageAtomicAdd(0, 0, 1, false); + stageSendAtomicAdd(0, 0, 1, false); qp->postSend(); #endif if (signaled) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); while (wcNum == 0) { - wcNum = qp->pollCq(); + wcNum = qp->pollSendCq(); } ASSERT_EQ(wcNum, 1); - int status = qp->getWcStatus(0); + int status = qp->getSendWcStatus(0); ASSERT_EQ(status, static_cast(mscclpp::WsStatus::Success)); } @@ -319,17 +320,17 @@ TEST_F(IbPeerToPeerTest, SimpleAtomicAdd) { if (gEnv->rank == 1) { mscclpp::Timer timer; for (int iter = 0; iter < maxIter; ++iter) { - stageAtomicAdd(0, 0, 1, true); + stageSendAtomicAdd(0, 0, 1, true); qp->postSend(); bool waiting = true; int spin = 0; while (waiting) { - int wcNum = qp->pollCq(); + int wcNum = qp->pollSendCq(); ASSERT_GE(wcNum, 0); for (int i = 0; i < wcNum; ++i) { - int status = qp->getWcStatus(i); + int status = qp->getSendWcStatus(i); if (status != static_cast(mscclpp::WsStatus::Success)) { - FAIL() << "Work completion status error: " << qp->getWcStatusString(i); + FAIL() << "Work completion status error: " << qp->getSendWcStatusString(i); } waiting = false; break; diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index bad80f0a..17046a57 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -71,12 +71,12 @@ class IbPeerToPeerTest : public IbTestBase { void registerBufferAndConnect(void* buf, size_t size); - void stageSend(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); + void stageSendWrite(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled); - void stageAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled); + void stageSendAtomicAdd(uint64_t wrId, uint64_t dstOffset, uint64_t addVal, bool signaled); - void stageSendWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, - unsigned int immData); + void stageSendWriteWithImm(uint32_t size, uint64_t wrId, uint64_t srcOffset, uint64_t dstOffset, bool signaled, + unsigned int immData); std::shared_ptr bootstrap; std::shared_ptr ibCtx; @@ -131,6 +131,8 @@ class CommunicatorTest : public CommunicatorTestBase { template using DeviceHandle = mscclpp::DeviceHandle; +using IbMode = mscclpp::EndpointConfig::Ib::Mode; + class PortChannelOneToOneTest : public CommunicatorTestBase { protected: struct PingPongTestParams { @@ -138,17 +140,19 @@ class PortChannelOneToOneTest : public CommunicatorTestBase { bool useIB; bool useEthernet; bool waitWithPoll; + IbMode ibMode; }; void SetUp() override; void TearDown() override; void setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, - void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0); + void* sendBuff, size_t sendBuffBytes, void* recvBuff = nullptr, size_t recvBuffBytes = 0, + IbMode ibMode = IbMode::Default); void testPingPong(PingPongTestParams params); void testPingPongPerf(PingPongTestParams params); - void testPacketPingPong(bool useIbOnly); - void testPacketPingPongPerf(bool useIbOnly); + void testPacketPingPong(bool useIbOnly, IbMode ibMode = IbMode::Default); + void testPacketPingPongPerf(bool useIbOnly, IbMode ibMode = IbMode::Default); std::shared_ptr proxyService; }; diff --git a/test/mp_unit/port_channel_tests.cu b/test/mp_unit/port_channel_tests.cu index cbd5cb6d..7cc5954a 100644 --- a/test/mp_unit/port_channel_tests.cu +++ b/test/mp_unit/port_channel_tests.cu @@ -18,7 +18,7 @@ void 
PortChannelOneToOneTest::TearDown() { CommunicatorTestBase::TearDown(); } void PortChannelOneToOneTest::setupMeshConnections(std::vector& portChannels, bool useIPC, bool useIb, bool useEthernet, void* sendBuff, size_t sendBuffBytes, - void* recvBuff, size_t recvBuffBytes) { + void* recvBuff, size_t recvBuffBytes, IbMode ibMode) { const int rank = communicator->bootstrap()->getRank(); const int worldSize = communicator->bootstrap()->getNranks(); const bool isInPlace = (recvBuff == nullptr); @@ -47,6 +47,7 @@ void PortChannelOneToOneTest::setupMeshConnections(std::vectorargs["ib_gid_index"]); + cfg.ib.mode = ibMode; } else if (useEthernet) { cfg.transport = mscclpp::Transport::Ethernet; } @@ -162,7 +163,8 @@ void PortChannelOneToOneTest::testPingPong(PingPongTestParams params) { std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int), + nullptr, 0, params.ibMode); std::vector> portChannelHandles; for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); @@ -207,7 +209,8 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { std::vector portChannels; std::shared_ptr buff = mscclpp::GpuBuffer(nElem).memory(); - setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int)); + setupMeshConnections(portChannels, params.useIPC, params.useIB, params.useEthernet, buff.get(), nElem * sizeof(int), + nullptr, 0, params.ibMode); std::vector> portChannelHandles; for (auto& ch : portChannels) portChannelHandles.push_back(ch.deviceHandle()); @@ -245,47 +248,64 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) { } TEST_F(PortChannelOneToOneTest, PingPong) { - testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIb) { +TEST_F(PortChannelOneToOneTest, PingPongIbHostMode) { #if defined(USE_IBVERBS) - testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongEthernet) { - testPingPong(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } TEST_F(PortChannelOneToOneTest, PingPongWithPoll) { - testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true}); + testPingPong(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongIbWithPoll) { +TEST_F(PortChannelOneToOneTest, PingPongIbHostModeWithPoll) { #if defined(USE_IBVERBS) - testPingPong(PingPongTestParams{.useIPC = 
false, .useIB = true, .useEthernet = false, .waitWithPoll = true}); + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true, .ibMode = IbMode::Host}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongPerf) { - testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Default}); } -TEST_F(PortChannelOneToOneTest, PingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostMode) { #if defined(USE_IBVERBS) - testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::Host}); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PingPongPerfIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) { - testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false}); + testPingPongPerf(PingPongTestParams{ + .useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false, .ibMode = IbMode::Default}); } __device__ mscclpp::DeviceSyncer gChannelOneToOneTestPortChansSyncer; @@ -354,7 +374,7 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m } } -void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { +void PortChannelOneToOneTest::testPacketPingPong(bool useIb, IbMode ibMode) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -367,7 +387,7 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), - getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); + getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), ibMode); ASSERT_EQ(portChannels.size(), 1); @@ -421,7 +441,7 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIb) { proxyService->stopProxy(); } -void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { +void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb, IbMode ibMode) { if (gEnv->rank >= numRanksToUse) return; const int nElem = 4 * 1024 * 1024; @@ -434,7 +454,7 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { auto getPacketBuffer = mscclpp::GpuBuffer(nPacket).memory(); setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), - getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket)); + getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket), ibMode); ASSERT_EQ(portChannels.size(), 1); @@ -477,21 
+497,46 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) { proxyService->stopProxy(); } -TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongIb) { +TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostMode) { #if defined(USE_IBVERBS) - testPacketPingPong(true); + testPacketPingPong(true, IbMode::Host); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); } +TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false, IbMode::Default); } -TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) { +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostMode) { #if defined(USE_IBVERBS) - testPacketPingPongPerf(true); + testPacketPingPongPerf(true, IbMode::Host); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPacketPingPongPerf(true, IbMode::HostNoAtomic); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PingPongIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPingPong(PingPongTestParams{ + .useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false, .ibMode = IbMode::HostNoAtomic}); +#else // !defined(USE_IBVERBS) + GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; +#endif // !defined(USE_IBVERBS) +} + +TEST_F(PortChannelOneToOneTest, PacketPingPongIbHostNoAtomicMode) { +#if defined(USE_IBVERBS) + testPacketPingPong(true, IbMode::HostNoAtomic); #else // !defined(USE_IBVERBS) GTEST_SKIP() << "This test requires IBVerbs that the current build does not support."; #endif // !defined(USE_IBVERBS) From dff3bc7bbb4c38d71d918209513e513799fba69a Mon Sep 17 00:00:00 2001 From: Caio Rocha <164253795+caiomcbr@users.noreply.github.com> Date: Thu, 12 Feb 2026 17:27:20 -0800 Subject: [PATCH 09/52] Support Fusion for ReadPutPacket Operation at DSL (#742) Support is being added for fusing the ReadPutPacket operation on DSL, which reduces the overhead caused by reading packet data multiple times in the scratch buffer. 
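For illustration, a sketch of the DSL pattern this targets (buffer names are hypothetical; the call shape follows the new allgather_pkt_rppkt.py test, with the destination chunk passed first):

    ch0 = MemoryChannel(peer0, rank)
    ch1 = MemoryChannel(peer1, rank)
    # Consecutive rppkt operations in the same thread block, same source chunk:
    ch0.read_put_packets(scratch[peer0][i : i + 1], scratch[rank][j : j + 1], tb=0)
    ch1.read_put_packets(scratch[peer1][i : i + 1], scratch[rank][j : j + 1], tb=0)

After fusion, the executor reads each packet from the shared source once and writes it to both destinations.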
Fusion will occur when two rppkt operations are executed consecutively with the same src_buffer: rppkt(src, dst0) + rppkt(src, dst1) -> rppkt(src, [dst0, dst1])

Co-authored-by: Binyang Li
---
 .../mscclpp/language/internal/operations.py   | 20 +++++
 .../tests/single_node/allgather_pkt_rppkt.py  | 78 +++++++++++++++++++
 python/test/executor_test.py                  |  2 +-
 src/core/include/execution_kernel.hpp         | 14 ++--
 4 files changed, 105 insertions(+), 9 deletions(-)
 create mode 100644 python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py

diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py
index 127f4a03..5f719c21 100644
--- a/python/mscclpp/language/internal/operations.py
+++ b/python/mscclpp/language/internal/operations.py
@@ -534,6 +534,7 @@ class PutOperation(BaseOperation):
         self.dst_buff = dst_buff
         self.channel_ids = channel_ids
         self.channel_type = channel_type
+        self.from_packet = from_packet
         self.to_packet = to_packet
         self.with_signal = with_signal
         self.with_signal_and_flush = with_signal_and_flush
@@ -579,6 +580,25 @@ class PutOperation(BaseOperation):
                 with_signal=self.with_signal,
                 with_signal_and_flush=self.with_signal_and_flush,
             )
+        elif (
+            isinstance(other, PutOperation)
+            and self.name == Instruction.read_put_packet
+            and self.name == other.name
+            and self.src_buff == other.src_buff
+            and self.channel_type == other.channel_type
+            and self.tbg_info == other.tbg_info
+        ):
+            fused_operation = PutOperation(
+                src_buff=self.src_buff,
+                dst_buff=self.dst_buff + other.dst_buff,
+                channel_ids=self.channel_ids + other.channel_ids,
+                channel_type=self.channel_type,
+                tbg_info=self.tbg_info,
+                from_packet=self.from_packet,
+                to_packet=self.to_packet,
+                with_signal=self.with_signal,
+                with_signal_and_flush=self.with_signal_and_flush,
+            )
 
         return fused_operation
 
diff --git a/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py
new file mode 100644
index 00000000..bda9e36c
--- /dev/null
+++ b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py
@@ -0,0 +1,78 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
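+#
+# This test exercises the ReadPutPacket fusion described in the commit
+# message: consecutive read_put_packets calls that read the same source
+# chunk, e.g. (a rough sketch; the channel and chunk names are illustrative)
+#   ch0.read_put_packets(dst0_chunk, src_chunk, tb)
+#   ch1.read_put_packets(dst1_chunk, src_chunk, tb)
+# are lowered to a single fused rppkt(src, [dst0, dst1]) operation.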
+ +import argparse +from mscclpp.language.channel import * +from mscclpp.language.rank import * +from mscclpp.language.general import * +from mscclpp.language.program import * +from mscclpp.language.collectives import * + + +def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size): + chunksperloop = 1 + collective = AllGather(gpu_size, chunksperloop, True) + with CollectiveProgram( + name, + collective, + gpu_size, + protocol="LL", + num_threads_per_block=num_threads_per_block, + use_double_scratch_buffer=True, + min_message_size=min_message_size, + max_message_size=max_message_size, + ): + # Creating Scratch Buffers + scratch_buffer = [] + for gpu in range(gpu_size): + scratch_buffer.append(Buffer(gpu, 2 * gpu_size)) + + # Copying it to scratch buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + scratch_offset = gpu_size + input_buffer = rank.get_input_buffer() + rank.copy_packets( + scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], input_buffer[0:1], tb=0 + ) + + # Putting packets in the remote scratch buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + output_buffer = rank.get_output_buffer() + for peer in range(1, gpu_size): + dst_rank = (gpu + peer) % gpu_size + ch = MemoryChannel(dst_rank, gpu) + tb = 0 + ch.read_put_packets( + scratch_buffer[dst_rank][gpu : gpu + 1], + scratch_buffer[gpu][scratch_offset + gpu : scratch_offset + gpu + 1], + tb, + ) + + # Copying packets from local scratch buffer to local buffer + for gpu in range(gpu_size): + rank = Rank(gpu) + output_buffer = rank.get_output_buffer() + for peer in range(1, gpu_size): + dst_rank = (gpu + peer) % gpu_size + rank.unpack_packets( + output_buffer[dst_rank : dst_rank + 1], + scratch_buffer[gpu][dst_rank : dst_rank + 1], + tb=0, + ) + + print(JSON()) + + +parser = argparse.ArgumentParser() + +parser.add_argument("--name", type=str, help="name of the program") +parser.add_argument("--num_gpus", type=int, help="number of gpus") +parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block") +parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size") +parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size") + +args = parser.parse_args() + +allgather_example(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size) diff --git a/python/test/executor_test.py b/python/test/executor_test.py index 49e5166f..59bc1661 100644 --- a/python/test/executor_test.py +++ b/python/test/executor_test.py @@ -11,7 +11,7 @@ from mscclpp import ( env, ) from mscclpp import CommGroup, GpuBuffer -from mscclpp.utils import KernelBuilder, GpuBuffer, pack +from mscclpp.utils import KernelBuilder, pack import os import struct diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp index 918bff61..74283244 100644 --- a/src/core/include/execution_kernel.hpp +++ b/src/core/include/execution_kernel.hpp @@ -298,11 +298,11 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat ChannelType chType = op.channelType; if (chType == ChannelType::MEMORY) { size_t nPackets = size / sizeof(PacketPayload); + PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1)); for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) { + PacketPayload data = pkts[pktIdx].read(flag_); + PacketType pkt(data, flag_); for (uint32_t idx = 0; idx 
< nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        PacketPayload data = pkts[pktIdx].read(flag_);
-        PacketType pkt(data, flag_);
         size_t offset = (scratchOffset_ + (dstOffsets[idx] << 1)) / sizeof(PacketType);
         void* remoteMemory = static_cast(memoryChannelBufferPtrs_[op.outputBufferRefs[idx].id]);
         mscclpp::write(remoteMemory, offset + pktIdx, pkt);
@@ -312,10 +312,8 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
     // Ensuring Data Is Ready
     size_t nPackets = size / sizeof(PacketPayload);
     for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) {
-      for (uint32_t idx = 0; idx < nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        pkts[pktIdx].read(flag_);
-      }
+      PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1));
+      pkts[pktIdx].read(flag_);
     }
 
     __syncthreads();
@@ -325,7 +323,7 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
       return;
     }
     uint32_t dstOffset = (dstOffsets[chIdx] << 1) + scratchOffset_;
-    uint32_t srcOffset = (srcOffsets[chIdx] << 1) + scratchOffset_;
+    uint32_t srcOffset = (srcOffsets[0] << 1) + scratchOffset_;
     MemoryId dstMemoryId = portChannelBufferIds_[op.outputBufferRefs[chIdx].id];
     portChannels_[channelIndexes[chIdx]].put(
         dstMemoryId, dstOffset, static_cast(BufferType::SCRATCH) + localMemoryIdBegin_, srcOffset, size << 1);

From bd68319e3eabe5d5370042ce1234e047032e8731 Mon Sep 17 00:00:00 2001
From: Binyang Li
Date: Thu, 12 Feb 2026 19:06:18 -0800
Subject: [PATCH 10/52] Refactor algo selection logic and introduce symmetric_memory env (#741)

This PR refactors the algorithm selection logic in MSCCL++ and introduces
support for configuring symmetric memory through an environment variable.

1. Algorithm Selection Refactoring
   Use a separate class for algorithm selection. This makes it possible to
   introduce more complex selection logic based on message size, architecture,
   whether a CUDA graph is enabled, and the memory allocation method.
2. Symmetric Memory Support
   Introduce a symmetricMemory parameter in algorithm context key generation.
   Remove the disableChannelCache env, as it was ambiguous.
3. New args for build_default_algorithms
   Add flag_buffer and flag_buffer_size args when building the default
   algorithms. This lets different algorithms share a unified flag buffer,
   which avoids the application hanging when switching algorithms across
   message sizes (see the usage sketch below).
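A usage sketch from the Python side (the scratch size and rank value below
are illustrative only; the wrapper keeps the 3-argument signature and fills
in flag_buffer/flag_buffer_size from the shared default flag buffer):

    import torch
    import mscclpp

    rank = 0  # illustrative; normally the caller's rank
    scratch = torch.zeros(1 << 24, dtype=torch.uint8, device="cuda")
    builder = mscclpp.ext.AlgorithmCollectionBuilder()
    # flag_buffer and flag_buffer_size are supplied internally via
    # get_default_flag_buffer(), so all default algorithms share one buffer.
    algos = builder.build_default_algorithms(
        scratch_buffer=scratch.data_ptr(),
        scratch_buffer_size=scratch.nbytes,
        rank=rank,
    )

Applications that allocate buffers symmetrically across all ranks can
additionally set MSCCLPP_NCCL_SYMMETRIC_MEMORY=1 to keep memory handles
cached.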
--------- Co-authored-by: chhwang <8018170+chhwang@users.noreply.github.com> Co-authored-by: Qinghua Zhou Co-authored-by: Caio Rocha --- docs/guide/mscclpp-torch-integration.md | 4 +- .../customized_allgather.cu | 7 +- .../torch-integration/customized_allgather.cu | 7 +- .../customized_comm_with_default_algo.py | 4 +- include/mscclpp/algorithm.hpp | 12 +- include/mscclpp/env.hpp | 11 +- .../algorithm_collection_builder.hpp | 6 +- include/mscclpp/gpu.hpp | 5 + python/csrc/algorithm.cpp | 18 +- python/csrc/core_py.cpp | 4 +- .../ext/algorithm_collection_builder_py.cpp | 2 +- python/mscclpp/_core/algorithm.py | 23 ++ .../ext/algorithm_collection_builder.py | 16 +- python/mscclpp/utils.py | 6 + src/core/algorithm.cc | 21 +- src/core/env.cpp | 4 +- .../algorithm_collection_builder.cc | 18 +- .../allgather/allgather_fullmesh.cu | 7 +- .../allgather/allgather_fullmesh_2.cu | 19 +- .../allreduce/allreduce_allpair_packet.cu | 83 ++---- .../allreduce/allreduce_fullmesh.cu | 29 +- .../collectives/allreduce/allreduce_nvls.cu | 45 +++- .../allreduce/allreduce_nvls_packet.cu | 82 ++---- .../allreduce/allreduce_nvls_with_copy.cu | 11 +- .../allreduce/allreduce_nvls_with_copy_2.cu | 11 +- .../collectives/allreduce/allreduce_packet.cu | 48 ++-- .../include/allgather/allgather_fullmesh.hpp | 2 +- .../allgather/allgather_fullmesh_2.hpp | 6 +- .../allreduce/allreduce_allpair_packet.hpp | 14 +- .../include/allreduce/allreduce_fullmesh.hpp | 3 +- .../include/allreduce/allreduce_nvls.hpp | 4 +- .../allreduce/allreduce_nvls_packet.hpp | 14 +- .../allreduce/allreduce_nvls_with_copy.hpp | 2 +- .../allreduce/allreduce_nvls_with_copy_2.hpp | 2 +- .../include/allreduce/allreduce_packet.hpp | 12 +- .../collectives/include/allreduce/common.hpp | 2 +- .../collectives/include/collective_utils.hpp | 1 - src/ext/nccl/algorithm_selector.cc | 172 ++++++++++++ src/ext/nccl/algorithm_selector.hpp | 48 ++++ src/ext/nccl/datatype_conversion.hpp | 6 +- src/ext/nccl/{nccl.cu => nccl.cc} | 249 ++++++++---------- test/torch/allreduce_temp_buff.py | 4 +- test/torch/memory_report.py | 2 +- 43 files changed, 657 insertions(+), 389 deletions(-) create mode 100644 src/ext/nccl/algorithm_selector.cc create mode 100644 src/ext/nccl/algorithm_selector.hpp rename src/ext/nccl/{nccl.cu => nccl.cc} (82%) diff --git a/docs/guide/mscclpp-torch-integration.md b/docs/guide/mscclpp-torch-integration.md index 236dd8ef..6e3dc20b 100644 --- a/docs/guide/mscclpp-torch-integration.md +++ b/docs/guide/mscclpp-torch-integration.md @@ -343,8 +343,8 @@ public: }, // Context key generation function [self](const void* input, void* output, - size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateContextKey(input, output, inputSize, outputSize, dtype); + size_t inputSize, size_t outputSize, mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); } ); } diff --git a/examples/customized-collective-algorithm/customized_allgather.cu b/examples/customized-collective-algorithm/customized_allgather.cu index 436a6a94..e78c4777 100644 --- a/examples/customized-collective-algorithm/customized_allgather.cu +++ b/examples/customized-collective-algorithm/customized_allgather.cu @@ -107,9 +107,10 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, 
dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { return self->generateAllgatherContextKey(input, output, inputSize, outputSize, - static_cast(dtype)); + static_cast(dtype), symmetricMemory); }); return allgatherAlgo; } @@ -191,7 +192,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, ncclDataType_t dtype) { + size_t outputSize, ncclDataType_t dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_allgather.cu b/examples/torch-integration/customized_allgather.cu index 10400ddc..d48c4410 100644 --- a/examples/torch-integration/customized_allgather.cu +++ b/examples/torch-integration/customized_allgather.cu @@ -75,8 +75,9 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype); + [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype, symmetricMemory); }); return allgatherAlgo; } @@ -159,7 +160,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder { } mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize, - size_t outputSize, mscclpp::DataType dtype) { + size_t outputSize, mscclpp::DataType dtype, bool) { return {(void*)input, output, inputSize, outputSize, 0}; } }; diff --git a/examples/torch-integration/customized_comm_with_default_algo.py b/examples/torch-integration/customized_comm_with_default_algo.py index 78560f15..281169cc 100644 --- a/examples/torch-integration/customized_comm_with_default_algo.py +++ b/examples/torch-integration/customized_comm_with_default_algo.py @@ -15,7 +15,9 @@ import ipaddress def load_algorithms(scratch_buffer: torch.tensor, rank: int) -> mscclpp.AlgorithmCollection: collection_builder = mscclpp.ext.AlgorithmCollectionBuilder() return collection_builder.build_default_algorithms( - scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank + scratch_buffer=scratch_buffer.data_ptr(), + scratch_buffer_size=scratch_buffer.nbytes, + rank=rank, ) diff --git a/include/mscclpp/algorithm.hpp b/include/mscclpp/algorithm.hpp index 7acdb8b8..6cc05ad4 100644 --- a/include/mscclpp/algorithm.hpp +++ b/include/mscclpp/algorithm.hpp @@ -96,11 +96,13 @@ class Algorithm { /// @param executor The executor for DSL algorithms (may be nullptr for native). /// @param nBlocks Number of CUDA blocks (0 for auto-selection). /// @param nThreadsPerBlock Number of threads per block (0 for auto-selection). + /// @param symmetricMemory Whether to use symmetric memory optimization. /// @param extras Additional parameters for algorithm-specific customization. /// @return The result of the operation. 
virtual CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) = 0; /// Reset the algorithm state, clearing any cached contexts. @@ -201,9 +203,10 @@ class NativeAlgorithm : public Algorithm { /// @param inputSize Size of the input buffer. /// @param outputSize Size of the output buffer. /// @param dtype Data type of the elements. + /// @param symmetricMemory Whether symmetric memory is enabled. /// @return A key uniquely identifying this buffer configuration. using ContextKeyGenFunc = std::function; + size_t outputSize, DataType dtype, bool symmetricMemory)>; /// Construct a NativeAlgorithm. /// @param name Human-readable name of the algorithm. @@ -225,6 +228,7 @@ class NativeAlgorithm : public Algorithm { CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) override; const std::string& name() const override; const std::string& collective() const override; @@ -274,6 +278,7 @@ class DslAlgorithm : public Algorithm, public AlgorithmBuilder, public std::enab CommResult execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr executor, int nBlocks = 0, int nThreadsPerBlock = 0, + bool symmetricMemory = false, const std::unordered_map& extras = {}) override; AlgorithmType type() const override { return AlgorithmType::DSL; } Constraint constraint() const override; @@ -299,6 +304,7 @@ struct CollectiveRequest { const void* inputBuffer; void* outputBuffer; size_t messageSize; + cudaStream_t stream; const std::string& collective; const DataType dtype; const std::unordered_map>& hints; @@ -358,6 +364,10 @@ class AlgorithmCollection { AlgoSelectFunc fallbackAlgoSelector_ = nullptr; }; +/// Get a default GPU flag buffer (allocated once and reused). +/// @return A pair of (shared_ptr to the flag buffer, size in bytes). +std::pair, size_t> getDefaultFlagBuffer(); + } // namespace mscclpp #endif // MSCCLPP_ALGORITHM_HPP_ \ No newline at end of file diff --git a/include/mscclpp/env.hpp b/include/mscclpp/env.hpp index bd3983e9..39f73e8d 100644 --- a/include/mscclpp/env.hpp +++ b/include/mscclpp/env.hpp @@ -98,12 +98,13 @@ class Env { /// debugging purposes. Currently supports `all`, `broadcast`, `allreduce`, `reducescatter`, and `allgather`. const std::string forceNcclFallbackOperation; - /// Env name: `MSCCLPP_DISABLE_CHANNEL_CACHE`. If set to true, it will disable the channel cache for NCCL APIs. - /// Currently, this should be set to true if the application may call NCCL APIs on the same local buffer with - /// different remote buffers, e.g., in the case of a dynamic communicator. If CUDA/HIP graphs are used, disabling - /// the channel cache won't affect the performance, but otherwise it may lead to performance degradation. + /// Env name: `MSCCLPP_NCCL_SYMMETRIC_MEMORY`. If set to true, it indicates that the application uses symmetric memory + /// allocation across all ranks, making it safe to cache memory handles for all NCCL algorithms. 
If set to false, the + /// system will either use non-zero-copy algorithms (when CUDA/HIP graphs are not enabled) or set up new connections + /// every time (when CUDA/HIP graphs are enabled). This should be set to false if the application may call NCCL APIs + /// on the same local buffer with different remote buffers, e.g., in the case of a dynamic communicator. /// Default is false. - const bool disableChannelCache; + const bool ncclSymmetricMemory; /// Env name: `MSCCLPP_FORCE_DISABLE_NVLS`. If set to true, it will disable the NVLS support in MSCCL++. /// Default is false. diff --git a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp index 201d7440..394e8014 100644 --- a/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp +++ b/include/mscclpp/ext/collectives/algorithm_collection_builder.hpp @@ -47,7 +47,8 @@ class AlgorithmCollectionBuilder { /// @return The built AlgorithmCollection containing all registered algorithms. AlgorithmCollection build(); - AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, int rank); + AlgorithmCollection buildDefaultAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank); private: AlgorithmCollectionBuilder() = default; @@ -55,7 +56,8 @@ class AlgorithmCollectionBuilder { AlgoSelectFunc algoSelector_ = nullptr; AlgoSelectFunc fallbackAlgoSelector_ = nullptr; - AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize); + AlgorithmCollection buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, size_t scratchBufferSize, + uintptr_t flagBuffer, size_t flagBufferSize); AlgorithmCollection buildDefaultDslAlgorithms(int rank); static std::shared_ptr gAlgorithmCollectionBuilder_; diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 6a0929aa..b8d096e2 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -15,6 +15,7 @@ using cudaGraphExec_t = hipGraphExec_t; using cudaDeviceProp = hipDeviceProp_t; using cudaStream_t = hipStream_t; using cudaStreamCaptureMode = hipStreamCaptureMode; +using cudaStreamCaptureStatus = hipStreamCaptureStatus; using cudaMemcpyKind = hipMemcpyKind; using cudaIpcMemHandle_t = hipIpcMemHandle_t; @@ -35,6 +36,9 @@ constexpr auto cudaErrorNotSupported = hipErrorNotSupported; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; constexpr auto cudaStreamCaptureModeGlobal = hipStreamCaptureModeGlobal; constexpr auto cudaStreamCaptureModeRelaxed = hipStreamCaptureModeRelaxed; +constexpr auto cudaStreamCaptureStatusNone = hipStreamCaptureStatusNone; +constexpr auto cudaStreamCaptureStatusActive = hipStreamCaptureStatusActive; +constexpr auto cudaStreamCaptureStatusInvalidated = hipStreamCaptureStatusInvalidated; constexpr auto cudaHostAllocMapped = hipHostMallocMapped; constexpr auto cudaHostAllocWriteCombined = hipHostMallocWriteCombined; constexpr auto cudaMemcpyDefault = hipMemcpyDefault; @@ -98,6 +102,7 @@ constexpr auto CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = HIP_POINTER_ATTRIBUTE_DEVIC #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) #define cudaStreamEndCapture(...) hipStreamEndCapture(__VA_ARGS__) #define cudaStreamDestroy(...) hipStreamDestroy(__VA_ARGS__) +#define cudaStreamIsCapturing(...) hipStreamIsCapturing(__VA_ARGS__) #define cudaGraphCreate(...) hipGraphCreate(__VA_ARGS__) #define cudaGraphInstantiate(...) 
hipGraphInstantiate(__VA_ARGS__) #define cudaGraphLaunch(...) hipGraphLaunch(__VA_ARGS__) diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp index 3553256a..c8365566 100644 --- a/python/csrc/algorithm.cpp +++ b/python/csrc/algorithm.cpp @@ -68,16 +68,17 @@ void register_algorithm(nb::module_& m) { "execute", [](Algorithm& self, std::shared_ptr comm, uintptr_t input, uintptr_t output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, uintptr_t stream, - std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, + std::shared_ptr executor, int nBlocks, int nThreadsPerBlock, bool symmetricMemory, std::unordered_map extras) { return self.execute(comm, reinterpret_cast(input), reinterpret_cast(output), inputSize, outputSize, dtype, op, reinterpret_cast(stream), executor, - nBlocks, nThreadsPerBlock, extras); + nBlocks, nThreadsPerBlock, symmetricMemory, extras); }, nb::arg("comm"), nb::arg("input"), nb::arg("output"), nb::arg("input_size"), nb::arg("output_size"), nb::arg("dtype"), nb::arg("op") = ReduceOp::NOP, nb::arg("stream") = 0, nb::arg("executor") = nullptr, - nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, - nb::arg("extras") = std::unordered_map()); + nb::arg("n_blocks") = 0, nb::arg("n_threads_per_block") = 0, nb::arg("symmetric_memory") = false, + nb::arg("extras") = std::unordered_map()) + .def("reset", &Algorithm::reset); nb::class_(algorithmClass, "Constraint") .def(nb::init<>()) @@ -108,8 +109,17 @@ void register_algorithm(nb::module_& m) { .def_prop_ro("output_buffer", [](const CollectiveRequest& self) { return reinterpret_cast(self.outputBuffer); }) .def_ro("message_size", &CollectiveRequest::messageSize) + .def_prop_ro("stream", [](const CollectiveRequest& self) { return reinterpret_cast(self.stream); }) .def_prop_ro("collective", [](const CollectiveRequest& self) { return self.collective; }) .def_ro("dtype", &CollectiveRequest::dtype) .def_prop_ro("hints", [](const CollectiveRequest& self) { return self.hints; }) .def("buffer_mode", &CollectiveRequest::bufferMode); + + m.def( + "cpp_get_default_flag_buffer", + []() { + auto [buffer, size] = getDefaultFlagBuffer(); + return std::make_pair(reinterpret_cast(buffer.get()), size); + }, + "Get the default flag buffer. 
Returns a tuple of (buffer_ptr, buffer_size)."); } \ No newline at end of file diff --git a/python/csrc/core_py.cpp b/python/csrc/core_py.cpp index a862c7e5..7d1e37ba 100644 --- a/python/csrc/core_py.cpp +++ b/python/csrc/core_py.cpp @@ -44,7 +44,9 @@ void register_core(nb::module_& m) { .value("uint32", DataType::UINT32) .value("float16", DataType::FLOAT16) .value("float32", DataType::FLOAT32) - .value("bfloat16", DataType::BFLOAT16); + .value("bfloat16", DataType::BFLOAT16) + .value("float8_e4m3", DataType::FP8_E4M3) + .value("float8_e5m2", DataType::FP8_E5M2); nb::class_(m, "CppBootstrap") .def("get_rank", &Bootstrap::getRank) diff --git a/python/csrc/ext/algorithm_collection_builder_py.cpp b/python/csrc/ext/algorithm_collection_builder_py.cpp index 1a912724..be7f944e 100644 --- a/python/csrc/ext/algorithm_collection_builder_py.cpp +++ b/python/csrc/ext/algorithm_collection_builder_py.cpp @@ -29,6 +29,6 @@ void register_algorithm_collection_builder(nb::module_& m) { nb::arg("selector")) .def("build", &AlgorithmCollectionBuilder::build) .def("build_default_algorithms", &AlgorithmCollectionBuilder::buildDefaultAlgorithms, nb::arg("scratch_buffer"), - nb::arg("scratch_buffer_size"), nb::arg("rank")) + nb::arg("scratch_buffer_size"), nb::arg("flag_buffer"), nb::arg("flag_buffer_size"), nb::arg("rank")) .def_static("reset", &AlgorithmCollectionBuilder::reset); } \ No newline at end of file diff --git a/python/mscclpp/_core/algorithm.py b/python/mscclpp/_core/algorithm.py index 6c4a3f20..c712bf88 100644 --- a/python/mscclpp/_core/algorithm.py +++ b/python/mscclpp/_core/algorithm.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Optional, Tuple, Dict from functools import cached_property +import cupy as cp from mscclpp._mscclpp import ( @@ -18,6 +19,7 @@ from mscclpp._mscclpp import ( CppReduceOp, CppAlgorithmBuilder, CppAlgorithmCollection, + cpp_get_default_flag_buffer, ) __all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"] @@ -160,6 +162,7 @@ class Algorithm: executor: Optional[CppExecutor] = None, nblocks=0, nthreads_per_block=0, + symmetric_memory: bool = False, extras: Optional[Dict[str, int]] = None, ) -> int: """Execute the collective algorithm. @@ -176,6 +179,7 @@ class Algorithm: executor: The executor for DSL algorithms (required for DSL, optional for native). nblocks: Number of CUDA blocks (0 for auto-selection). nthreads_per_block: Number of threads per block (0 for auto-selection). + symmetric_memory: Whether to use symmetric memory optimization (default: False). extras: Additional algorithm-specific parameters. Returns: @@ -193,9 +197,14 @@ class Algorithm: executor, nblocks, nthreads_per_block, + symmetric_memory, extras if extras is not None else {}, ) + def reset(self): + """Reset the internal state of the algorithm, if applicable.""" + self._algorithm.reset() + class AlgorithmBuilder: def __init__(self, algorithm_builder: CppAlgorithmBuilder): @@ -230,3 +239,17 @@ class AlgorithmCollection: """Register an algorithm for a collective operation.""" self._native_collection.register_algorithm(collective, algo_name, algorithm._algorithm) self._algorithms.append(algorithm) + + +def get_default_flag_buffer() -> cp.ndarray: + """Get the default flag buffer for algorithm selection. + + This buffer is used internally by default algorithms to store selection flags. + It is allocated as a shared GPU buffer and can be accessed from Python. + + Returns: + A CuPy array representing the flag buffer on the GPU. 
+    """
+    buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
+    memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
+    return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
diff --git a/python/mscclpp/ext/algorithm_collection_builder.py b/python/mscclpp/ext/algorithm_collection_builder.py
index 8361bd2f..80c68909 100644
--- a/python/mscclpp/ext/algorithm_collection_builder.py
+++ b/python/mscclpp/ext/algorithm_collection_builder.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 
 from typing import Union
 
-from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection
+from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
 import atexit
 
 from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
@@ -29,6 +29,7 @@ class AlgorithmCollectionBuilder:
         if not hasattr(self, "_initialized"):
             self._builder = CppAlgorithmCollectionBuilder.get_instance()
             self._initialized = True
+            self._flag_buffer = None
 
     def add_algorithm_builder(self, algorithm_builder: Union[AlgorithmBuilder, Algorithm]):
         if isinstance(algorithm_builder, AlgorithmBuilder):
@@ -50,8 +51,17 @@ class AlgorithmCollectionBuilder:
         collection = self._builder.build()
         return AlgorithmCollection(collection)
 
-    def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
-        native_collection = self._builder.build_default_algorithms(int(scratch_buffer), scratch_buffer_size, rank)
+    def build_default_algorithms(
+        self,
+        scratch_buffer: int,
+        scratch_buffer_size: int,
+        rank: int,
+    ) -> AlgorithmCollection:
+        if self._flag_buffer is None:
+            self._flag_buffer = get_default_flag_buffer()
+        native_collection = self._builder.build_default_algorithms(
+            int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
+        )
         return AlgorithmCollection(native_collection)
 
diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py
index 69dd7ce6..e7b7381b 100644
--- a/python/mscclpp/utils.py
+++ b/python/mscclpp/utils.py
@@ -192,5 +192,11 @@ def torch_dtype_to_mscclpp_dtype(dtype: "torch.dtype") -> DataType:
         return DataType.int32
     elif dtype == torch.bfloat16:
         return DataType.bfloat16
+    # Hardware supports either the OCP format or the FNUZ format for float8;
+    # both map to the same MSCCL++ data type.
+ elif dtype == torch.float8_e5m2 or dtype == torch.float8_e5m2fnuz: + return DataType.float8_e5m2 + elif dtype == torch.float8_e4m3fn or dtype == torch.float8_e4m3fnuz: + return DataType.float8_e4m3 else: raise ValueError(f"Unknown data type: {dtype}") diff --git a/src/core/algorithm.cc b/src/core/algorithm.cc index 31c98f15..eaaeb4a1 100644 --- a/src/core/algorithm.cc +++ b/src/core/algorithm.cc @@ -3,6 +3,7 @@ #include #include +#include #include "logger.hpp" @@ -40,12 +41,12 @@ NativeAlgorithm::NativeAlgorithm(std::string name, std::string collective, InitF CommResult NativeAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp op, cudaStream_t stream, std::shared_ptr, int nBlocks, int nThreadsPerBlock, - const std::unordered_map& extras) { + bool symmetricMemory, const std::unordered_map& extras) { if (!initialized_) { initFunc_(comm); initialized_ = true; } - AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype); + AlgorithmCtxKey ctxKey = contextKeyGenFunc_(input, output, inputSize, outputSize, dtype, symmetricMemory); auto it = contexts_.find(ctxKey); if (it == contexts_.end()) { auto ctx = contextInitFunc_(comm, input, output, inputSize, outputSize, dtype); @@ -155,7 +156,7 @@ Algorithm::Constraint DslAlgorithm::constraint() const { return constraint_; } CommResult DslAlgorithm::execute(std::shared_ptr comm, const void* input, void* output, size_t inputSize, size_t outputSize, DataType dtype, ReduceOp, cudaStream_t stream, - std::shared_ptr executor, int, int, + std::shared_ptr executor, int, int, bool, const std::unordered_map&) { if (!executor) { THROW(EXEC, Error, ErrorCode::InvalidUsage, "Executor is null in DslAlgorithm::execute"); @@ -198,4 +199,18 @@ std::shared_ptr DslAlgorithm::build() { return shared_from_this(); } // TODO: implement this void DslAlgorithm::reset() {} +static std::weak_ptr gDefaultFlagBuffer; +static size_t gDefaultFlagCount = 128; + +std::pair, size_t> getDefaultFlagBuffer() { + std::shared_ptr flagBuffer = gDefaultFlagBuffer.lock(); + if (!flagBuffer) { + flagBuffer = mscclpp::detail::gpuCallocShared(gDefaultFlagCount); + std::vector initFlags(gDefaultFlagCount, 1); + mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice); + gDefaultFlagBuffer = flagBuffer; + } + return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)}; +} + } // namespace mscclpp \ No newline at end of file diff --git a/src/core/env.cpp b/src/core/env.cpp index a70e3d28..484b40af 100644 --- a/src/core/env.cpp +++ b/src/core/env.cpp @@ -64,7 +64,7 @@ Env::Env() cudaIpcUseDefaultStream(readEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", false)), ncclSharedLibPath(readEnv("MSCCLPP_NCCL_LIB_PATH", "")), forceNcclFallbackOperation(readEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")), - disableChannelCache(readEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", false)), + ncclSymmetricMemory(readEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)), forceDisableNvls(readEnv("MSCCLPP_FORCE_DISABLE_NVLS", false)) {} std::shared_ptr env() { @@ -91,7 +91,7 @@ std::shared_ptr env() { logEnv("MSCCLPP_CUDAIPC_USE_DEFAULT_STREAM", globalEnv->cudaIpcUseDefaultStream); logEnv("MSCCLPP_NCCL_LIB_PATH", globalEnv->ncclSharedLibPath); logEnv("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", globalEnv->forceNcclFallbackOperation); - logEnv("MSCCLPP_DISABLE_CHANNEL_CACHE", globalEnv->disableChannelCache); + logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory); 
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls); } return globalEnv; diff --git a/src/ext/collectives/algorithm_collection_builder.cc b/src/ext/collectives/algorithm_collection_builder.cc index 67e616ae..1ede7519 100644 --- a/src/ext/collectives/algorithm_collection_builder.cc +++ b/src/ext/collectives/algorithm_collection_builder.cc @@ -49,8 +49,9 @@ AlgorithmCollection AlgorithmCollectionBuilder::build() { void AlgorithmCollectionBuilder::reset() { gAlgorithmCollectionBuilder_.reset(); } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize, int rank) { - auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize); + size_t scratchBufferSize, uintptr_t flagBuffer, + size_t flagBufferSize, int rank) { + auto nativeCollection = buildDefaultNativeAlgorithms(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize); auto dslCollection = buildDefaultDslAlgorithms(rank); nativeCollection.extend(dslCollection); nativeCollection.setSelectors(algoSelector_, fallbackAlgoSelector_); @@ -58,11 +59,15 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultAlgorithms(uintptr_t } AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uintptr_t scratchBuffer, - size_t scratchBufferSize) { + size_t scratchBufferSize, + uintptr_t flagBuffer, + size_t flagBufferSize) { AlgorithmCollection collection; - auto allreduceAllpairPkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceAllpairPkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceAllpairPkt->collective(), allreduceAllpairPkt->name(), allreduceAllpairPkt); - auto allreduceNvlsPacket = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreduceNvlsPacket = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsPacket->collective(), allreduceNvlsPacket->name(), allreduceNvlsPacket); auto allreduceNvlsWithCopy = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsWithCopy->collective(), allreduceNvlsWithCopy->name(), @@ -70,7 +75,8 @@ AlgorithmCollection AlgorithmCollectionBuilder::buildDefaultNativeAlgorithms(uin auto allreduceNvlsWithCopy2 = std::make_shared(scratchBuffer, scratchBufferSize)->build(); collection.registerAlgorithm(allreduceNvlsWithCopy2->collective(), allreduceNvlsWithCopy2->name(), allreduceNvlsWithCopy2); - auto allreducePkt = std::make_shared(scratchBuffer, scratchBufferSize)->build(); + auto allreducePkt = + std::make_shared(scratchBuffer, scratchBufferSize, flagBuffer, flagBufferSize)->build(); collection.registerAlgorithm(allreducePkt->collective(), allreducePkt->name(), allreducePkt); auto allreduceNvls = std::make_shared()->build(); collection.registerAlgorithm(allreduceNvls->collective(), allreduceNvls->name(), allreduceNvls); diff --git a/src/ext/collectives/allgather/allgather_fullmesh.cu b/src/ext/collectives/allgather/allgather_fullmesh.cu index 34f8d4e7..0b288b38 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh.cu @@ -170,7 +170,7 @@ std::shared_ptr AllgatherFullmesh::initAllgatherContext(std::shared_ptr AllgatherFullmesh::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType 
dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllgatherContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allgather/allgather_fullmesh_2.cu b/src/ext/collectives/allgather/allgather_fullmesh_2.cu index 84f14ca2..cf6027d9 100644 --- a/src/ext/collectives/allgather/allgather_fullmesh_2.cu +++ b/src/ext/collectives/allgather/allgather_fullmesh_2.cu @@ -107,12 +107,6 @@ __global__ void __launch_bounds__(1024, 1) } } -AllgatherFullmesh2::AllgatherFullmesh2() : disableChannelCache_(false) { - if (mscclpp::env()->disableChannelCache) { - disableChannelCache_ = true; - } -} - void AllgatherFullmesh2::initialize(std::shared_ptr comm) { this->conns_ = setupConnections(comm); this->memorySemaphores_ = setupMemorySemaphores(comm, this->conns_, nChannelsPerConnection_); @@ -174,7 +168,7 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptrbootstrap()->getNranks(); recvBasePtr = (CUdeviceptr)output; @@ -197,10 +191,11 @@ std::shared_ptr AllgatherFullmesh2::initAllgatherContext(std::shared_ptr AllgatherFullmesh2::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllgatherContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllgatherContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu index f6081043..83950d7c 100644 --- a/src/ext/collectives/allreduce/allreduce_allpair_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_allpair_packet.cu @@ -11,29 +11,18 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - -template +template __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, - int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags) { + int worldSize, size_t nelems, uint32_t numScratchBuff, void* flags, + uint32_t flagSize) { // This version of allreduce only works for single nodes if (worldSize != nRanksPerNode) return; if (sizeof(T) == 2 || sizeof(T) == 1) nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int); const int nPeers = nRanksPerNode - 1; - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % numScratchBuff) ? 
(scratchBufferSize / numScratchBuff) : 0; size_t channelScratchOffset = scratchBaseOffset; @@ -62,22 +51,12 @@ __global__ void allreduceAllPairs(T* buff, T* scratch, T* resultBuff, DeviceHand } dst[idx] = data; } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + if (blockIdx.x == 0 && threadIdx.x >= gridDim.x && threadIdx.x < flagSize / sizeof(uint32_t)) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } } @@ -93,19 +72,13 @@ struct AllpairAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, + cudaStream_t stream, void* flags, uint32_t flagSize, uint32_t numScratchBuff, int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); - if (nBlocks == 7 || nBlocks == 28) { - allreduceAllPairs<<>>( - (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); - return cudaGetLastError(); - } allreduceAllPairs<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, numScratchBuff, flags); + nRanksPerNode, worldSize, nelems, numScratchBuff, flags, flagSize); return cudaGetLastError(); } }; @@ -116,12 +89,6 @@ void AllreduceAllpairPacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); - std::vector flags(28, 1); - flags7_ = detail::gpuCallocShared(7); - flags28_ = detail::gpuCallocShared(28); - gpuMemcpy(flags7_.get(), flags.data(), 7, cudaMemcpyHostToDevice); - gpuMemcpy(flags28_.get(), flags.data(), 28, cudaMemcpyHostToDevice); } CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, @@ -133,13 +100,6 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptrworkSize); } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 7) { - flags = this->flags7_.get(); - } else if (blockAndThreadNum.first == 28) { - flags = this->flags28_.get(); - } - size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -150,10 +110,11 @@ CommResult AllreduceAllpairPacket::allreduceKernelFunc(const std::shared_ptr(dtype)); return CommResult::CommInvalidArgument; } - cudaError_t error = allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, - nullptr, nullptr, channelInOffset, 0, 
this->scratchBufferSize_, algoCtx->rank, - algoCtx->nRanksPerNode, algoCtx->workSize, inputSize, stream, flags, - this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + cudaError_t error = + allreduce(input, this->scratchBuffer_, output, algoCtx->memoryChannelDeviceHandles.get(), nullptr, nullptr, + nullptr, channelInOffset, 0, this->scratchBufferSize_, algoCtx->rank, algoCtx->nRanksPerNode, + algoCtx->workSize, inputSize, stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, + this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -185,7 +146,7 @@ std::shared_ptr AllreduceAllpairPacket::initAllreduceContext(std::shared_p return ctx; } -AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* input, void*, size_t, DataType, bool) { size_t sendBytes; CUdeviceptr sendBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -193,7 +154,8 @@ AlgorithmCtxKey AllreduceAllpairPacket::generateAllreduceContextKey(const void* } std::shared_ptr AllreduceAllpairPacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_allpair_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, @@ -206,8 +168,9 @@ std::shared_ptr AllreduceAllpairPacket::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_fullmesh.cu b/src/ext/collectives/allreduce/allreduce_fullmesh.cu index d04766c1..13c63ba1 100644 --- a/src/ext/collectives/allreduce/allreduce_fullmesh.cu +++ b/src/ext/collectives/allreduce/allreduce_fullmesh.cu @@ -149,7 +149,8 @@ struct AllreduceAllconnectAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void* memoryOutChannels, DeviceHandle*, DeviceHandle*, size_t, size_t channelOutDataOffset, size_t, int rank, int nRanksPerNode, int worldSize, - size_t inputSize, cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + size_t inputSize, cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, + int nThreadsPerBlock) { using ChannelType = DeviceHandle; size_t nelems = inputSize / sizeof(T); if (nBlocks == 0) nBlocks = 35; @@ -180,8 +181,11 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct auto ctx = std::static_pointer_cast(ctx_void); size_t recvBytes; CUdeviceptr recvBasePtr; - MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, 
&recvBytes, (CUdeviceptr)output)); - size_t channelOutOffset = (char*)output - (char*)recvBasePtr; + size_t channelOutOffset = 0; + if (symmetricMemory_) { + MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); + channelOutOffset = (char*)output - (char*)recvBasePtr; + } std::shared_ptr> inputChannelHandles; if (this->memoryChannelsMap_.find(input) != this->memoryChannelsMap_.end()) { inputChannelHandles = this->memoryChannelsMap_[input].second; @@ -204,7 +208,7 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct cudaError_t error = allreduce(input, this->scratchBuffer_, output, inputChannelHandles.get(), ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, 0, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceAllconnect failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -212,19 +216,21 @@ CommResult AllreduceFullmesh::allreduceKernelFunc(const std::shared_ptr ct return CommResult::CommSuccess; } -AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType) { +AlgorithmCtxKey AllreduceFullmesh::generateAllreduceContextKey(const void*, void* output, size_t, DataType, + bool symmetricMemory) { static int tag = 0; size_t recvBytes; CUdeviceptr recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&recvBasePtr, &recvBytes, (CUdeviceptr)output)); - if (env()->disableChannelCache) { + symmetricMemory_ = symmetricMemory; + if (!symmetricMemory_) { return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, tag++}; } return AlgorithmCtxKey{nullptr, (void*)recvBasePtr, 0, recvBytes, 0}; } std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptr comm, const void*, - void* output, size_t, DataType) { + void* output, size_t size, DataType) { auto ctx = std::make_shared(); ctx->rank = comm->bootstrap()->getRank(); ctx->workSize = comm->bootstrap()->getNranks(); @@ -236,6 +242,10 @@ std::shared_ptr AllreduceFullmesh::initAllreduceContext(std::shared_ptrregisterMemory((void*)recvBasePtr, recvBytes, Transport::CudaIpc); ctx->registeredMemories = setupRemoteMemories(comm, ctx->rank, localMemory); ctx->memoryChannels = setupMemoryChannels(this->conns_, ctx->memorySemaphores, ctx->registeredMemories, localMemory, @@ -258,8 +268,9 @@ std::shared_ptr AllreduceFullmesh::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git a/src/ext/collectives/allreduce/allreduce_nvls.cu b/src/ext/collectives/allreduce/allreduce_nvls.cu index 98f884f8..b07993a0 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls.cu @@ -23,9 +23,18 @@ __global__ void 
__launch_bounds__(1024, 1) int nBlocks = gridDim.x; int bid = blockIdx.x; size_t sizePerRank = size / nRanksPerNode; - size_t sizePerBlock = sizePerRank / nBlocks; + const size_t minAlign = 16; + // Align sizePerBlock to 16 bytes to ensure aligned vector access in handleMultiLoadReduceStore + size_t sizePerBlock = (sizePerRank + nBlocks - 1) / nBlocks; + sizePerBlock = (sizePerBlock + minAlign - 1) / minAlign * minAlign; + size_t rankOffset = sizePerRank * rank; size_t blockOffset = sizePerBlock * bid + rankOffset; + size_t curBlockSize = 0; + if (sizePerBlock * bid < sizePerRank) { + curBlockSize = min(sizePerBlock, sizePerRank - sizePerBlock * bid); + } + mscclpp::DeviceHandle* multicastPtr = multicast + bid; mscclpp::DeviceHandle* multicastOutPtr = multicastOut + bid; @@ -44,8 +53,10 @@ __global__ void __launch_bounds__(1024, 1) __syncthreads(); T* src = (T*)multicastPtr->mcPtr; T* dst = (T*)multicastOutPtr->mcPtr; - handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock, - threadIdx.x, blockDim.x); + if (curBlockSize > 0) { + handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, curBlockSize, + threadIdx.x, blockDim.x); + } __syncthreads(); if (threadIdx.x < nPeers) { channels[threadIdx.x].relaxedSignal(); @@ -60,7 +71,7 @@ struct NvlsAdapter { mscclpp::DeviceHandle* nvlsChannels, mscclpp::DeviceHandle* nvlsOutChannels, size_t channelInOffset, size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if (!defined(__CUDA_ARCH_SPECIFIC__) && !defined(__CUDA_ARCH_FAMILY_SPECIFIC__)) || (__CUDA_ARCH__ < 1000) if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -77,7 +88,12 @@ struct NvlsAdapter { }; void AllreduceNvls::initialize(std::shared_ptr comm) { - nSwitchChannels_ = 8; + int device; + MSCCLPP_CUDATHROW(cudaGetDevice(&device)); + cudaDeviceProp deviceProp; + MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&deviceProp, device)); + computeCapabilityMajor_ = deviceProp.major; + nSwitchChannels_ = 32; this->conns_ = setupConnections(comm); // setup semaphores std::vector> memorySemaphores = @@ -91,6 +107,10 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo size_t inputSize, mscclpp::DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, const std::unordered_map&) { + if (!symmetricMemory_) { + WARN("AllreduceNvls requires symmetric memory for now."); + return CommResult::CommInvalidArgument; + } auto ctx = std::static_pointer_cast(ctx_void); AllreduceFunc allreduce = dispatch(op, dtype); if (!allreduce) { @@ -110,12 +130,16 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } std::pair numBlocksAndThreads = {nBlocks, nThreadsPerBlock}; if (numBlocksAndThreads.first == 0 || numBlocksAndThreads.second == 0) { - numBlocksAndThreads = {ctx->nRanksPerNode, 1024}; + numBlocksAndThreads = {::min(ctx->nRanksPerNode, nSwitchChannels_), 1024}; + // For GB200 devices, using more blocks to improve the performances when nRanksPerNode <= 8 + if (computeCapabilityMajor_ == 10 && ctx->nRanksPerNode <= 8) { + numBlocksAndThreads.first = ::min(32, nSwitchChannels_); + } } cudaError_t error = allreduce(nullptr, nullptr, nullptr, this->memoryChannelsDeviceHandle_.get(), nullptr, nvlsChannels, 
nvlsOutChannels, channelInOffset, channelOutOffset, 0, ctx->rank, ctx->nRanksPerNode, ctx->workSize, - inputSize, stream, nullptr, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); + inputSize, stream, nullptr, 0, 0, numBlocksAndThreads.first, numBlocksAndThreads.second); if (error != cudaSuccess) { WARN("AllreduceNvls failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -124,7 +148,8 @@ CommResult AllreduceNvls::allreduceKernelFunc(const std::shared_ptr ctx_vo } mscclpp::AlgorithmCtxKey AllreduceNvls::generateAllreduceContextKey(const void* input, void* output, size_t, - mscclpp::DataType) { + mscclpp::DataType, bool symmetricMemory) { + symmetricMemory_ = symmetricMemory; size_t sendBytes, recvBytes; CUdeviceptr sendBasePtr, recvBasePtr; MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); @@ -174,7 +199,9 @@ std::shared_ptr AllreduceNvls::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu index bc7d596a..9f1371c2 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_packet.cu @@ -9,25 +9,15 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; -template +template __global__ void __launch_bounds__(1024, 1) allreduceNvlsPacket([[maybe_unused]] const T* input, [[maybe_unused]] T* scratch, [[maybe_unused]] T* output, [[maybe_unused]] mscclpp::DeviceHandle* multicast, [[maybe_unused]] size_t nelems, [[maybe_unused]] size_t scratchBufferSize, - [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags) { + [[maybe_unused]] int rank, [[maybe_unused]] int worldSize, [[maybe_unused]] void* flags, + [[maybe_unused]] uint32_t flagBufferSize) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 - uint32_t flag = 0; - if constexpr (flagPerBlock) { - flag = ((uint32_t*)flags)[blockIdx.x]; - } else { - flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } - } - + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t scratchBaseOffset = (flag % 2) ? 
scratchBufferSize / 2 : 0; uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; uint32_t nPktPerRank = nelems / worldSize / (sizeof(mscclpp::LL8Packet::Payload) / sizeof(T)); @@ -51,21 +41,13 @@ __global__ void __launch_bounds__(1024, 1) } dst[i] = data; } - if constexpr (flagPerBlock) { - __syncthreads(); - if (threadIdx.x == 0) { - ((uint32_t*)flags)[blockIdx.x] = flag + 1; - } - } else { - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); - } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; - } + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; + } + // Update the remaining flags in case the next launch uses a different number of blocks + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #endif } @@ -85,30 +67,17 @@ struct AllreduceNvlsPacketAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void*, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int, int worldSize, size_t inputSize, cudaStream_t stream, - void* flags, uint32_t, int nBlocks, int nThreadsPerBlock) { - if (nBlocks == 4 || nBlocks == 8) { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } else { - allreduceNvlsPacket - <<>>((const T*)input, (T*)scratch, (T*)output, nvlsChannels, - inputSize / sizeof(T), scratchBufferSize, rank, worldSize, flags); - } + void* flags, uint32_t flagBufferSize, uint32_t, int nBlocks, int nThreadsPerBlock) { + allreduceNvlsPacket<<>>( + (const T*)input, (T*)scratch, (T*)output, nvlsChannels, inputSize / sizeof(T), scratchBufferSize, rank, + worldSize, flags, flagBufferSize); return cudaGetLastError(); } }; -void AllreduceNvlsPacket::initialize(std::shared_ptr) { - std::vector flags(8, 1); - flags_ = detail::gpuCallocShared(16); - flags4_ = detail::gpuCallocShared(4); - flags8_ = detail::gpuCallocShared(8); - gpuMemcpy(flags4_.get(), flags.data(), 4, cudaMemcpyHostToDevice); - gpuMemcpy(flags8_.get(), flags.data(), 8, cudaMemcpyHostToDevice); -} +void AllreduceNvlsPacket::initialize(std::shared_ptr) {} -AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType) { +AlgorithmCtxKey AllreduceNvlsPacket::generateAllreduceContextKey(const void*, void*, size_t, DataType, bool) { return AlgorithmCtxKey{nullptr, nullptr, 0, 0, 0}; } @@ -146,16 +115,10 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr WARN("Unsupported operation or data type for allreduce, dtype=%d", static_cast(dtype)); return CommResult::CommInvalidArgument; } - void* flags = this->flags_.get(); - if (blockAndThreadNum.first == 4) { - flags = this->flags4_.get(); - } else if (blockAndThreadNum.first == 8) { - flags = this->flags8_.get(); - } + // Flags now come from the builder-wide flagBuffer_, replacing the removed per-block-count flags4_/flags8_ buffers. cudaError_t error = allreduce(input, this->scratchBuffer_, output, nullptr, nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, - 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, flags, - 0, blockAndThreadNum.first, blockAndThreadNum.second); + 0, 0, this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, + (void*)flagBuffer_, (uint32_t)flagBufferSize_, 0, blockAndThreadNum.first, blockAndThreadNum.second); if 
(error != cudaSuccess) { WARN("AllreduceNvlsPacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -164,7 +127,8 @@ CommResult AllreduceNvlsPacket::allreduceKernelFunc(const std::shared_ptr } std::shared_ptr AllreduceNvlsPacket::build() { - auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_); + auto self = std::make_shared((uintptr_t)scratchBuffer_, scratchBufferSize_, flagBuffer_, + flagBufferSize_); return std::make_shared( "default_allreduce_nvls_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, @@ -178,7 +142,9 @@ std::shared_ptr AllreduceNvlsPacket::build() { [[maybe_unused]] size_t outputSize, mscclpp::DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, - mscclpp::DataType dtype) { return self->generateAllreduceContextKey(input, output, inputSize, dtype); }); + mscclpp::DataType dtype, bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); + }); } } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu b/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu index 113fdb7c..033f3311 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_with_copy.cu @@ -113,7 +113,7 @@ struct NvlsWithCopyAdapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -157,7 +157,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); @@ -166,7 +166,7 @@ CommResult AllreduceNvlsWithCopy::allreduceKernelFunc(const std::shared_ptr AllreduceNvlsWithCopy::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } } // namespace collective diff --git 
a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu b/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu index 2a109c6f..96aa9168 100644 --- a/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu +++ b/src/ext/collectives/allreduce/allreduce_nvls_with_copy_2.cu @@ -150,7 +150,7 @@ struct NvlsWithCopy2Adapter { static cudaError_t call(const void* input, void* scratch, void* output, void* memoryChannels, void*, DeviceHandle* nvlsChannels, DeviceHandle*, size_t, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int, size_t inputSize, - cudaStream_t stream, void*, uint32_t, int nBlocks, int nThreadsPerBlock) { + cudaStream_t stream, void*, uint32_t, uint32_t, int nBlocks, int nThreadsPerBlock) { #if defined(__CUDA_ARCH__) // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS if constexpr (std::is_same_v || std::is_same_v) { return cudaErrorNotSupported; @@ -194,7 +194,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptrscratchBuffer_, output, this->memoryChannelsDeviceHandle_.get(), nullptr, ctx->switchChannelDeviceHandles.get(), nullptr, 0, 0, this->scratchBufferSize_, - ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, + ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, stream, nullptr, 0, 0, blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreduceNvlsWithCopy failed with error: %s", cudaGetErrorString(error)); @@ -203,7 +203,7 @@ CommResult AllreduceNvlsWithCopy2::allreduceKernelFunc(const std::shared_ptr AllreduceNvlsWithCopy2::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/allreduce/allreduce_packet.cu b/src/ext/collectives/allreduce/allreduce_packet.cu index 23ed5d09..9ce67085 100644 --- a/src/ext/collectives/allreduce/allreduce_packet.cu +++ b/src/ext/collectives/allreduce/allreduce_packet.cu @@ -11,13 +11,11 @@ namespace mscclpp { namespace collective { -__device__ uint32_t deviceFlag = 1; - template __global__ void __launch_bounds__(1024, 1) allreducePacket(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle* memoryChannels, size_t channelDataOffset, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, - size_t nelems, void* flags, uint32_t numScratchBuff + size_t nelems, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff #if defined(ENABLE_NPKIT) , NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) { @@ -60,11 +58,7 @@ __global__ void __launch_bounds__(1024, 1) const int nPeers = nRanksPerNode - 1; const size_t nPkts = nelems / 2; - uint32_t flag = deviceFlag; - __syncthreads(); - if (threadIdx.x == 0) { - ((LL8Packet*)flags)[blockIdx.x].write(0, flag); - } + uint32_t flag = ((uint32_t*)flags)[blockIdx.x]; size_t channelScratchOffset = (flag % numScratchBuff) ? 
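+ // Same launch-counting scheme as allreduceNvlsPacket: flag % numScratchBuff selects this launch's scratch segment (numScratchBuff is nSegmentsForScratchBuffer_ = 2, so successive launches alternate between the two halves).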
scratchBufferSize / numScratchBuff : 0; int nelemsPerRank = nelems / worldSize; @@ -129,15 +123,12 @@ __global__ void __launch_bounds__(1024, 1) result[idx].y = data.y; } - // Make sure all threadblocks have finished reading before incrementing the flag - if (blockIdx.x == 0 && threadIdx.x < gridDim.x) { - ((LL8Packet*)flags)[threadIdx.x].read(flag, -1); + __syncthreads(); + if (threadIdx.x == 0) { + ((uint32_t*)flags)[blockIdx.x] = flag + 1; } - if (blockIdx.x == 0) { - __syncthreads(); - } - if (threadIdx.x == 0 && blockIdx.x == 0) { - deviceFlag++; + // Update the remaining flags in case the next launch uses a different number of blocks (mirrors allreduceNvlsPacket) + if (blockIdx.x == 0 && (threadIdx.x > gridDim.x - 1) && (threadIdx.x < flagBufferSize / sizeof(uint32_t))) { + ((uint32_t*)flags)[threadIdx.x] = flag + 1; } #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \ defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT) @@ -156,20 +147,22 @@ struct PacketAdapter { static cudaError_t call(const void* buff, void* scratch, void* resultBuff, void* memoryChannels, void*, DeviceHandle*, DeviceHandle*, size_t channelInOffset, size_t, size_t scratchBufferSize, int rank, int nRanksPerNode, int worldSize, size_t inputSize, - cudaStream_t stream, void* flags, uint32_t numScratchBuff, int nBlocks = 0, - int nThreadsPerBlock = 0) { + cudaStream_t stream, void* flags, uint32_t flagBufferSize, uint32_t numScratchBuff, + int nBlocks = 0, int nThreadsPerBlock = 0) { using ChannelType = DeviceHandle; const size_t nelems = inputSize / sizeof(T); + // Round the number of blocks down to a multiple of (worldSize - 1); assumes nBlocks >= worldSize - 1 + nBlocks = nBlocks / (worldSize - 1) * (worldSize - 1); #if defined(ENABLE_NPKIT) size_t sharedMemSize = sizeof(NpKitEvent) * NPKIT_SHM_NUM_EVENTS; allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff, NpKit::GetGpuEventCollectContexts(), + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp()); #else allreducePacket<<>>( (T*)buff, (T*)scratch, (T*)resultBuff, (ChannelType*)memoryChannels, channelInOffset, scratchBufferSize, rank, - nRanksPerNode, worldSize, nelems, flags, numScratchBuff); + nRanksPerNode, worldSize, nelems, flags, flagBufferSize, numScratchBuff); #endif return cudaGetLastError(); } @@ -215,7 +208,6 @@ void AllreducePacket::initialize(std::shared_ptr comm) { RegisteredMemory scratchMemory = comm->registerMemory(scratchBuffer_, scratchBufferSize_, Transport::CudaIpc); registeredMemories_ = setupRemoteMemories(comm, comm->bootstrap()->getRank(), scratchMemory); registeredMemories_.push_back(scratchMemory); - flags_ = detail::gpuCallocShared(maxBlockNum_); } CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_void, const void* input, void* output, @@ -233,7 +225,6 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ MSCCLPP_CUTHROW(cuMemGetAddressRange(&sendBasePtr, &sendBytes, (CUdeviceptr)input)); size_t channelInOffset = (char*)input - (char*)sendBasePtr; - void* flags = this->flags_.get(); AllreduceFunc allreduce = dispatch(op, dtype); if (!allreduce) { WARN("Unsupported operation or data type for allreduce: op=%d, dtype=%d", op, static_cast(dtype)); @@ -242,7 +233,8 @@ CommResult AllreducePacket::allreduceKernelFunc(const std::shared_ptr ctx_ cudaError_t error = allreduce(input, this->scratchBuffer_, output, ctx->memoryChannelDeviceHandles.get(), nullptr, nullptr, nullptr, channelInOffset, 0, 
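+ // The flag words now come from the flagBuffer_ member provided through the constructor, replacing the per-instance flags_ allocation that initialize() used to create.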
this->scratchBufferSize_, ctx->rank, ctx->nRanksPerNode, ctx->workSize, inputSize, - stream, flags, this->nSegmentsForScratchBuffer_, blockAndThreadNum.first, blockAndThreadNum.second); + stream, (void*)flagBuffer_, (uint32_t)flagBufferSize_, this->nSegmentsForScratchBuffer_, + blockAndThreadNum.first, blockAndThreadNum.second); if (error != cudaSuccess) { WARN("AllreducePacket failed with error: %s", cudaGetErrorString(error)); return CommResult::CommUnhandledCudaError; @@ -274,7 +266,7 @@ std::shared_ptr AllreducePacket::initAllreduceContext(std::shared_ptr AllreducePacket::build() { - auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_); + auto self = std::make_shared(reinterpret_cast(scratchBuffer_), scratchBufferSize_, + flagBuffer_, flagBufferSize_); return std::make_shared( "default_allreduce_packet", "allreduce", [self](std::shared_ptr comm) { self->initialize(comm); }, [self](const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, @@ -294,8 +287,9 @@ std::shared_ptr AllreducePacket::build() { [self](std::shared_ptr comm, const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { return self->initAllreduceContext(comm, input, output, inputSize, dtype); }, - [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype) { - return self->generateAllreduceContextKey(input, output, inputSize, dtype); + [self](const void* input, void* output, size_t inputSize, [[maybe_unused]] size_t outputSize, DataType dtype, + bool symmetricMemory) { + return self->generateAllreduceContextKey(input, output, inputSize, dtype, symmetricMemory); }); } diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp index 085f4ac4..d1a4bbcd 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh.hpp @@ -25,7 +25,7 @@ class AllgatherFullmesh : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; diff --git a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp index ea176ba1..56783e3b 100644 --- a/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp +++ b/src/ext/collectives/include/allgather/allgather_fullmesh_2.hpp @@ -11,11 +11,11 @@ namespace collective { class AllgatherFullmesh2 : public AlgorithmBuilder { public: - AllgatherFullmesh2(); + AllgatherFullmesh2() = default; std::shared_ptr build() override; private: - bool disableChannelCache_; + bool symmetricMemory_; std::vector conns_; std::vector> memorySemaphores_; const int nChannelsPerConnection_ = 35; @@ -27,7 +27,7 @@ class AllgatherFullmesh2 : public AlgorithmBuilder { std::shared_ptr initAllgatherContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllgatherContextKey(const void*, void*, size_t, DataType, bool); }; } // namespace collective diff --git 
a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp index e995b940..bd402cfa 100644 --- a/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_allpair_packet.hpp @@ -9,8 +9,11 @@ namespace mscclpp { namespace collective { class AllreduceAllpairPacket : public AlgorithmBuilder { public: - AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceAllpairPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -21,7 +24,7 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; @@ -30,9 +33,8 @@ class AllreduceAllpairPacket : public AlgorithmBuilder { std::vector conns_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; - std::shared_ptr flags7_; - std::shared_ptr flags28_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp index 31a7f145..fa811b15 100644 --- a/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_fullmesh.hpp @@ -20,7 +20,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; std::shared_ptr comm_; @@ -32,6 +32,7 @@ class AllreduceFullmesh : public mscclpp::AlgorithmBuilder { RegisteredMemory localScratchMemory_; std::unordered_map, std::shared_ptr>>> memoryChannelsMap_; + bool symmetricMemory_ = false; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls.hpp index 4591cb42..07074527 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls.hpp @@ -12,6 +12,7 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr build() override; private: + bool symmetricMemory_ = false; void initialize(std::shared_ptr comm); CommResult allreduceKernelFunc(const std::shared_ptr ctx, const void* input, void* output, size_t inputSize, DataType dtype, ReduceOp op, cudaStream_t stream, int nBlocks, int nThreadsPerBlock, @@ -19,13 +20,14 @@ class AllreduceNvls : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - 
AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); uint32_t nSwitchChannels_; std::shared_ptr> memoryChannelsDeviceHandle_; std::vector baseChannels_; std::vector conns_; + int computeCapabilityMajor_{0}; }; } // namespace collective diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp index 8761162a..1cfb5ffd 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_packet.hpp @@ -10,8 +10,11 @@ namespace mscclpp { namespace collective { class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { public: - AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreduceNvlsPacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -22,15 +25,14 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, mscclpp::DataType); - mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType); + mscclpp::AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, mscclpp::DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const size_t nvlsBufferSize_ = (1 << 30); const int maxBlockNum_ = 16; - std::shared_ptr flags_; - std::shared_ptr flags4_; - std::shared_ptr flags8_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; }; } // namespace collective } // namespace mscclpp diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp index 1077b122..97b72a2f 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy.hpp @@ -20,7 +20,7 @@ class AllreduceNvlsWithCopy : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; diff --git a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp index 7bfa9822..ca4ed1c6 100644 --- a/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_nvls_with_copy_2.hpp @@ -23,7 +23,7 @@ class AllreduceNvlsWithCopy2 : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); const size_t nvlsBufferSize_ = (1 << 30); void* scratchBuffer_; diff --git 
a/src/ext/collectives/include/allreduce/allreduce_packet.hpp b/src/ext/collectives/include/allreduce/allreduce_packet.hpp index f562aca5..f0438dea 100644 --- a/src/ext/collectives/include/allreduce/allreduce_packet.hpp +++ b/src/ext/collectives/include/allreduce/allreduce_packet.hpp @@ -9,8 +9,11 @@ namespace mscclpp { namespace collective { class AllreducePacket : public AlgorithmBuilder { public: - AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize) - : scratchBuffer_((void*)scratchBuffer), scratchBufferSize_(scratchBufferSize){}; + AllreducePacket(uintptr_t scratchBuffer, size_t scratchBufferSize, uintptr_t flagBuffer, size_t flagBufferSize) + : scratchBuffer_((void*)scratchBuffer), + scratchBufferSize_(scratchBufferSize), + flagBuffer_(flagBuffer), + flagBufferSize_(flagBufferSize){}; std::shared_ptr build() override; private: @@ -21,16 +24,17 @@ class AllreducePacket : public AlgorithmBuilder { std::shared_ptr initAllreduceContext(std::shared_ptr comm, const void*, void* output, size_t, DataType); - AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType); + AlgorithmCtxKey generateAllreduceContextKey(const void*, void*, size_t, DataType, bool); void* scratchBuffer_; size_t scratchBufferSize_; const int nSegmentsForScratchBuffer_ = 2; const int maxBlockNum_ = 56; std::vector conns_; + uintptr_t flagBuffer_; + size_t flagBufferSize_; std::vector> memorySemaphores_; std::vector registeredMemories_; - std::shared_ptr flags_; }; } // namespace collective } // namespace mscclpp \ No newline at end of file diff --git a/src/ext/collectives/include/allreduce/common.hpp b/src/ext/collectives/include/allreduce/common.hpp index 26b57dbf..4c28a24a 100644 --- a/src/ext/collectives/include/allreduce/common.hpp +++ b/src/ext/collectives/include/allreduce/common.hpp @@ -75,7 +75,7 @@ MSCCLPP_DEVICE_INLINE void handleMultiLoadReduceStore(T* src, T* dst, size_t src using AllreduceFunc = std::function*, mscclpp::DeviceHandle*, size_t, size_t, size_t, int, int, int, - size_t, cudaStream_t, void*, uint32_t, int, int)>; + size_t, cudaStream_t, void*, uint32_t, uint32_t, int, int)>; template