mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 09:17:06 +00:00
Merge branch 'main' into chhwang/fix-ib-no-atomic
This commit is contained in:
@@ -44,7 +44,7 @@ jobs:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ steps:
|
||||
targetType: inline
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ steps:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ steps:
|
||||
targetType: 'inline'
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ steps:
|
||||
set -e; \
|
||||
cd /root/mscclpp; \
|
||||
mkdir -p build && cd build; \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
|
||||
make -j"'
|
||||
kill $CHILD_PID
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -20,9 +20,9 @@ steps:
|
||||
script: |
|
||||
mkdir build && cd build
|
||||
if [ "${{ parameters.platform }}" == "rocm" ]; then
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
else
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
|
||||
fi
|
||||
make -j
|
||||
workingDirectory: '$(System.DefaultWorkingDirectory)'
|
||||
|
||||
@@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
|
||||
|
||||
# Options
|
||||
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
|
||||
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
|
||||
option(MSCCLPP_BUILD_TESTS "Build tests" OFF)
|
||||
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
|
||||
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
|
||||
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)
|
||||
@@ -56,6 +56,7 @@ option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
|
||||
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
|
||||
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
|
||||
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
|
||||
option(MSCCLPP_DISABLE_NB_LEAK_WARNINGS "Disable Nanobind leak warnings" ON)
|
||||
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
|
||||
|
||||
if(MSCCLPP_BYPASS_GPU_CHECK)
|
||||
@@ -185,7 +186,10 @@ if(MSCCLPP_USE_GDRCOPY)
|
||||
endif()
|
||||
|
||||
include(FetchContent)
|
||||
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
|
||||
FetchContent_Declare(json
|
||||
GIT_REPOSITORY https://github.com/nlohmann/json.git
|
||||
GIT_TAG v3.12.0
|
||||
)
|
||||
FetchContent_MakeAvailable(json)
|
||||
|
||||
if("${INSTALL_PREFIX}" STREQUAL "")
|
||||
|
||||
15
examples/tutorials/05-switch-channel/Makefile
Normal file
15
examples/tutorials/05-switch-channel/Makefile
Normal file
@@ -0,0 +1,15 @@
|
||||
CUDA_HOME ?= /usr/local/cuda
|
||||
|
||||
COMPILER := $(CUDA_HOME)/bin/nvcc
|
||||
ARCH_FLAG := -arch=native
|
||||
|
||||
TARGET = bidir_switch_channel
|
||||
SRC = bidir_switch_channel.cu
|
||||
|
||||
all: $(TARGET)
|
||||
|
||||
$(TARGET): $(SRC)
|
||||
$(COMPILER) $(ARCH_FLAG) -o $@ $< -lmscclpp
|
||||
|
||||
clean:
|
||||
rm -f $(TARGET)
|
||||
177
examples/tutorials/05-switch-channel/bidir_switch_channel.cu
Normal file
177
examples/tutorials/05-switch-channel/bidir_switch_channel.cu
Normal file
@@ -0,0 +1,177 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <mscclpp/concurrency_device.hpp>
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/gpu_utils.hpp>
|
||||
#include <mscclpp/switch_channel.hpp>
|
||||
#include <mscclpp/switch_channel_device.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#define PORT_NUMBER "50505"
|
||||
|
||||
template <typename... Args>
|
||||
void log(Args &&...args) {
|
||||
std::stringstream ss;
|
||||
(ss << ... << args);
|
||||
ss << std::endl;
|
||||
std::cout << ss.str();
|
||||
}
|
||||
|
||||
int spawn_process(std::function<void()> func) {
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) return -1;
|
||||
if (pid == 0) {
|
||||
// Child process
|
||||
func();
|
||||
exit(0);
|
||||
}
|
||||
return pid;
|
||||
}
|
||||
|
||||
int wait_process(int pid) {
|
||||
int status;
|
||||
if (waitpid(pid, &status, 0) < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (WIFEXITED(status)) {
|
||||
return WEXITSTATUS(status);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
__constant__ mscclpp::SwitchChannelDeviceHandle gConstSwitchChan;
|
||||
|
||||
__device__ mscclpp::DeviceSyncer devSyncer;
|
||||
|
||||
__global__ void kernelSwitchReduce(int rank, int numElements) {
|
||||
const int tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
int stride = blockDim.x * gridDim.x;
|
||||
|
||||
// rank 0 performs on first half of data and rank 1 on second half
|
||||
int min = rank * (numElements / 2);
|
||||
int max = (rank + 1) * (numElements / 2);
|
||||
|
||||
for (int i = tid + min; i < max; i += stride) {
|
||||
auto val = gConstSwitchChan.reduce<mscclpp::f32x1>(i);
|
||||
gConstSwitchChan.broadcast(i, val);
|
||||
}
|
||||
}
|
||||
|
||||
void worker(int myRank, int gpuId, const std::string &ipPort) {
|
||||
MSCCLPP_CUDATHROW(cudaSetDevice(gpuId));
|
||||
const int nRanks = 2;
|
||||
const int iter = 1000;
|
||||
const size_t bufferBytes = 128 * 1024 * 1024;
|
||||
|
||||
log("Rank ", myRank, " (GPU ", gpuId, "): Preparing for tests ...");
|
||||
|
||||
// Build a connection and a semaphore
|
||||
auto bootstrap = std::make_shared<mscclpp::TcpBootstrap>(myRank, nRanks);
|
||||
bootstrap->initialize(ipPort);
|
||||
std::shared_ptr<mscclpp::Communicator> comm = std::make_shared<mscclpp::Communicator>(bootstrap);
|
||||
|
||||
std::vector<int> ranks;
|
||||
ranks.reserve(nRanks);
|
||||
for (int i = 0; i < nRanks; i++) ranks.push_back(i);
|
||||
|
||||
auto buffer = mscclpp::GpuBuffer<float>(bufferBytes);
|
||||
|
||||
auto nvlsConnection = mscclpp::connectNvlsCollective(comm, ranks, bufferBytes);
|
||||
|
||||
auto switchChannel = nvlsConnection->bindAllocatedMemory(CUdeviceptr(buffer.data()), bufferBytes);
|
||||
|
||||
auto deviceHandle = switchChannel.deviceHandle();
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaMemcpyToSymbol(gConstSwitchChan, &deviceHandle, sizeof(deviceHandle)));
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
|
||||
// Call the kernel in a loop for perf evaluation
|
||||
|
||||
for (size_t numElements : {1024, 1024 * 1024, 32 * 1024 * 1024}) {
|
||||
cudaEvent_t start, end;
|
||||
if (myRank == 0) {
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&start));
|
||||
MSCCLPP_CUDATHROW(cudaEventCreate(&end));
|
||||
}
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
bootstrap->barrier();
|
||||
|
||||
if (myRank == 0) {
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(start, 0));
|
||||
}
|
||||
|
||||
for (int i = 0; i < iter; ++i) {
|
||||
kernelSwitchReduce<<<256, 1024>>>(myRank, numElements);
|
||||
}
|
||||
|
||||
MSCCLPP_CUDATHROW(cudaGetLastError());
|
||||
MSCCLPP_CUDATHROW(cudaDeviceSynchronize());
|
||||
|
||||
comm->bootstrap()->barrier();
|
||||
|
||||
if (myRank == 0) {
|
||||
MSCCLPP_CUDATHROW(cudaEventRecord(end, 0));
|
||||
MSCCLPP_CUDATHROW(cudaEventSynchronize(end));
|
||||
float elapsedTime;
|
||||
float elapsedTimePerIter;
|
||||
float gbps;
|
||||
MSCCLPP_CUDATHROW(cudaEventElapsedTime(&elapsedTime, start, end));
|
||||
elapsedTimePerIter = elapsedTime / iter;
|
||||
float dataSize = numElements * 4;
|
||||
gbps = dataSize / elapsedTimePerIter * 1e-6f;
|
||||
log("Rank ", myRank, " (GPU ", gpuId, "): bytes ", dataSize, ", elapsed ", elapsedTimePerIter, " ms/iter, BW ",
|
||||
gbps, " GB/s");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
int pid0 = spawn_process([]() { worker(0, 0, "lo:127.0.0.1:" PORT_NUMBER); });
|
||||
int pid1 = spawn_process([]() { worker(1, 1, "lo:127.0.0.1:" PORT_NUMBER); });
|
||||
if (pid0 < 0 || pid1 < 0) {
|
||||
log("Failed to spawn processes.");
|
||||
return -1;
|
||||
}
|
||||
int status0 = wait_process(pid0);
|
||||
int status1 = wait_process(pid1);
|
||||
if (status0 < 0 || status1 < 0) {
|
||||
log("Failed to wait for processes.");
|
||||
return -1;
|
||||
}
|
||||
if (status0 != 0 || status1 != 0) {
|
||||
log("One of the processes failed.");
|
||||
return -1;
|
||||
}
|
||||
log("Succeed!");
|
||||
return 0;
|
||||
} else if (argc == 4) {
|
||||
std::string ipPort = argv[1];
|
||||
int rank, gpuId;
|
||||
try {
|
||||
rank = std::stoi(argv[2]);
|
||||
gpuId = std::stoi(argv[3]);
|
||||
} catch (const std::exception &) {
|
||||
log("Error: rank and gpu_id must be valid integers.");
|
||||
return -1;
|
||||
}
|
||||
if (rank < 0 || rank > 2 || gpuId < 0) {
|
||||
log("Error: rank must be between 0 and 1 and gpu_id must be non-negative.");
|
||||
return -1;
|
||||
}
|
||||
worker(rank, gpuId, ipPort);
|
||||
log("Rank ", rank, ": Succeed!");
|
||||
return 0;
|
||||
} else {
|
||||
std::cerr << "Usage:\n"
|
||||
<< " " << argv[0] << " Run in intra-node mode\n"
|
||||
<< " " << argv[0] << " <ip_port> <rank> <gpu_id> Run in inter-node mode\n";
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
@@ -366,7 +366,7 @@ class AlgorithmCollection {
|
||||
|
||||
/// Get a default GPU flag buffer (allocated once and reused).
|
||||
/// @return A pair of (shared_ptr to the flag buffer, size in bytes).
|
||||
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer();
|
||||
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer();
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
add_subdirectory(csrc)
|
||||
add_subdirectory(test)
|
||||
|
||||
target_compile_definitions(mscclpp_py PRIVATE
|
||||
$<$<BOOL:${MSCCLPP_DISABLE_NB_LEAK_WARNINGS}>:MSCCLPP_DISABLE_NB_LEAK_WARNINGS>
|
||||
)
|
||||
|
||||
add_custom_target(pytest_lib_copy ALL
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_mscclpp.*.so
|
||||
@@ -12,4 +16,4 @@ add_custom_target(pytest_lib_copy ALL
|
||||
${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/_ext.*.so
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
|
||||
DEPENDS mscclpp_py mscclpp_py_test
|
||||
)
|
||||
)
|
||||
@@ -116,10 +116,15 @@ void register_algorithm(nb::module_& m) {
|
||||
.def("buffer_mode", &CollectiveRequest::bufferMode);
|
||||
|
||||
m.def(
|
||||
"cpp_get_default_flag_buffer",
|
||||
"cpp_get_flag_buffer",
|
||||
[]() {
|
||||
auto [buffer, size] = getDefaultFlagBuffer();
|
||||
return std::make_pair(reinterpret_cast<uintptr_t>(buffer.get()), size);
|
||||
auto [buffer, size] = getFlagBuffer();
|
||||
uintptr_t ptr = reinterpret_cast<uintptr_t>(buffer.get());
|
||||
// Transfer shared_ptr ownership into a capsule so Python's GC manages the lifetime.
|
||||
auto prevent = std::make_unique<std::shared_ptr<void>>(std::move(buffer));
|
||||
nb::capsule owner(prevent.get(), [](void* p) noexcept { delete static_cast<std::shared_ptr<void>*>(p); });
|
||||
prevent.release(); // capsule now owns the pointer
|
||||
return nb::make_tuple(ptr, size, owner);
|
||||
},
|
||||
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size).");
|
||||
"Get the default flag buffer. Returns a tuple of (buffer_ptr, buffer_size, owner).");
|
||||
}
|
||||
@@ -307,6 +307,9 @@ void register_core(nb::module_& m) {
|
||||
}
|
||||
|
||||
NB_MODULE(_mscclpp, m) {
|
||||
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
|
||||
nb::set_leak_warnings(false);
|
||||
#endif
|
||||
register_env(m);
|
||||
register_error(m);
|
||||
register_port_channel(m);
|
||||
@@ -324,4 +327,4 @@ NB_MODULE(_mscclpp, m) {
|
||||
|
||||
// ext
|
||||
register_algorithm_collection_builder(m);
|
||||
}
|
||||
}
|
||||
@@ -19,7 +19,7 @@ from mscclpp._mscclpp import (
|
||||
CppReduceOp,
|
||||
CppAlgorithmBuilder,
|
||||
CppAlgorithmCollection,
|
||||
cpp_get_default_flag_buffer,
|
||||
cpp_get_flag_buffer,
|
||||
)
|
||||
|
||||
__all__ = ["Algorithm", "AlgorithmBuilder", "AlgorithmCollection"]
|
||||
@@ -241,15 +241,22 @@ class AlgorithmCollection:
|
||||
self._algorithms.append(algorithm)
|
||||
|
||||
|
||||
def get_default_flag_buffer() -> cp.ndarray:
|
||||
_flag_buffer_cache = None
|
||||
|
||||
|
||||
def get_flag_buffer() -> cp.ndarray:
|
||||
"""Get the default flag buffer for algorithm selection.
|
||||
|
||||
This buffer is used internally by default algorithms to store selection flags.
|
||||
It is allocated as a shared GPU buffer and can be accessed from Python.
|
||||
The result is cached so all callers share the same buffer.
|
||||
|
||||
Returns:
|
||||
A CuPy array representing the flag buffer on the GPU.
|
||||
"""
|
||||
buffer_ptr, buffer_size = cpp_get_default_flag_buffer()
|
||||
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, None), 0)
|
||||
return cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
|
||||
global _flag_buffer_cache
|
||||
if _flag_buffer_cache is None:
|
||||
buffer_ptr, buffer_size, owner = cpp_get_flag_buffer()
|
||||
memptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(buffer_ptr, buffer_size, owner), 0)
|
||||
_flag_buffer_cache = cp.ndarray((buffer_size // 4,), dtype=cp.uint32, memptr=memptr)
|
||||
return _flag_buffer_cache
|
||||
|
||||
@@ -19,8 +19,8 @@ from mscclpp._mscclpp import (
|
||||
CppTransport,
|
||||
CppTransportFlags,
|
||||
)
|
||||
import mpi4py
|
||||
import numpy as np
|
||||
import pickle
|
||||
|
||||
from mscclpp.utils import is_torch_tensor
|
||||
|
||||
@@ -29,20 +29,40 @@ __all__ = ["CommGroup"]
|
||||
|
||||
class CommGroup:
|
||||
def __init__(
|
||||
self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
|
||||
self,
|
||||
mpi_comm: "mpi4py.MPI.Comm" = None,
|
||||
torch_group: "dist.ProcessGroup" = None,
|
||||
interfaceIpPortTrio: str = "",
|
||||
rank: int = None,
|
||||
size: int = None,
|
||||
):
|
||||
if interfaceIpPortTrio == "":
|
||||
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
|
||||
if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
|
||||
uniq_id = None
|
||||
if mpi_comm.rank == 0:
|
||||
# similar to NCCL's unique id
|
||||
rank, size = (
|
||||
(mpi_comm.Get_rank(), mpi_comm.Get_size())
|
||||
if mpi_comm is not None
|
||||
else (torch_group.rank(), torch_group.size())
|
||||
)
|
||||
self.bootstrap = CppTcpBootstrap.create(rank, size)
|
||||
if rank == 0:
|
||||
uniq_id = self.bootstrap.create_unique_id()
|
||||
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
|
||||
if mpi_comm is not None:
|
||||
import mpi4py
|
||||
|
||||
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
|
||||
else:
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
if rank == 0:
|
||||
uniq_id_global = uniq_id
|
||||
pickled_data = pickle.dumps(uniq_id)
|
||||
data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone()
|
||||
else:
|
||||
data_tensor = torch.zeros(256, dtype=torch.uint8)
|
||||
dist.broadcast(data_tensor, src=0, group=torch_group)
|
||||
uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
|
||||
self.bootstrap.initialize(uniq_id_global)
|
||||
elif mpi_comm:
|
||||
# use this instead
|
||||
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
|
||||
self.bootstrap.initialize(interfaceIpPortTrio)
|
||||
elif not interfaceIpPortTrio == "":
|
||||
assert rank >= 0 and size >= 1
|
||||
self.bootstrap = CppTcpBootstrap.create(rank, size)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import Union
|
||||
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_default_flag_buffer
|
||||
from mscclpp._core.algorithm import Algorithm, AlgorithmBuilder, AlgorithmCollection, get_flag_buffer
|
||||
import atexit
|
||||
|
||||
from mscclpp._mscclpp import CppAlgorithmCollectionBuilder
|
||||
@@ -58,7 +58,7 @@ class AlgorithmCollectionBuilder:
|
||||
rank: int,
|
||||
) -> AlgorithmCollection:
|
||||
if self._flag_buffer is None:
|
||||
self._flag_buffer = get_default_flag_buffer()
|
||||
self._flag_buffer = get_flag_buffer()
|
||||
native_collection = self._builder.build_default_algorithms(
|
||||
int(scratch_buffer), scratch_buffer_size, self._flag_buffer.data.ptr, self._flag_buffer.nbytes, rank
|
||||
)
|
||||
|
||||
@@ -63,10 +63,13 @@ class MyProxyService {
|
||||
};
|
||||
|
||||
NB_MODULE(_ext, m) {
|
||||
#ifdef MSCCLPP_DISABLE_NB_LEAK_WARNINGS
|
||||
nb::set_leak_warnings(false);
|
||||
#endif
|
||||
nb::class_<MyProxyService>(m, "MyProxyService")
|
||||
.def(nb::init<int, int, int, nb::list, nb::list>(), nb::arg("rank"), nb::arg("nranks"), nb::arg("data_size"),
|
||||
nb::arg("reg_mem_list"), nb::arg("sem_list"))
|
||||
.def("fifo_device_handle", &MyProxyService::fifoDeviceHandle)
|
||||
.def("start", &MyProxyService::start)
|
||||
.def("stop", &MyProxyService::stop);
|
||||
}
|
||||
}
|
||||
@@ -199,18 +199,23 @@ std::shared_ptr<Algorithm> DslAlgorithm::build() { return shared_from_this(); }
|
||||
// TODO: implement this
|
||||
void DslAlgorithm::reset() {}
|
||||
|
||||
static std::weak_ptr<uint32_t> gDefaultFlagBuffer;
|
||||
static uint32_t* gDefaultFlagBuffer = nullptr;
|
||||
static std::weak_ptr<void> gDefaultFlagBufferWeak;
|
||||
static size_t gDefaultFlagCount = 128;
|
||||
|
||||
std::pair<std::shared_ptr<void>, size_t> getDefaultFlagBuffer() {
|
||||
std::shared_ptr<uint32_t> flagBuffer = gDefaultFlagBuffer.lock();
|
||||
if (!flagBuffer) {
|
||||
flagBuffer = mscclpp::detail::gpuCallocShared<uint32_t>(gDefaultFlagCount);
|
||||
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
|
||||
mscclpp::gpuMemcpy(flagBuffer.get(), initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
|
||||
gDefaultFlagBuffer = flagBuffer;
|
||||
std::pair<std::shared_ptr<void>, size_t> getFlagBuffer() {
|
||||
auto ptr = gDefaultFlagBufferWeak.lock();
|
||||
if (!ptr) {
|
||||
if (!gDefaultFlagBuffer) {
|
||||
// Intentionally never freed — CUDA driver reclaims GPU memory at process exit.
|
||||
gDefaultFlagBuffer = static_cast<uint32_t*>(mscclpp::detail::gpuCalloc(gDefaultFlagCount * sizeof(uint32_t)));
|
||||
std::vector<uint32_t> initFlags(gDefaultFlagCount, 1);
|
||||
mscclpp::gpuMemcpy(gDefaultFlagBuffer, initFlags.data(), gDefaultFlagCount, cudaMemcpyHostToDevice);
|
||||
}
|
||||
ptr = std::shared_ptr<void>(gDefaultFlagBuffer, [](void*) {});
|
||||
gDefaultFlagBufferWeak = ptr;
|
||||
}
|
||||
return {flagBuffer, gDefaultFlagCount * sizeof(uint32_t)};
|
||||
return {ptr, gDefaultFlagCount * sizeof(uint32_t)};
|
||||
}
|
||||
|
||||
} // namespace mscclpp
|
||||
@@ -294,7 +294,7 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
|
||||
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
|
||||
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
|
||||
|
||||
auto [buffer, size] = mscclpp::getDefaultFlagBuffer();
|
||||
auto [buffer, size] = mscclpp::getFlagBuffer();
|
||||
commPtr->flagBuffer_ = buffer;
|
||||
commPtr->flagBufferSize_ = size;
|
||||
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
|
||||
FetchContent_Declare(json
|
||||
GIT_REPOSITORY https://github.com/nlohmann/json.git
|
||||
GIT_TAG v3.12.0
|
||||
)
|
||||
FetchContent_MakeAvailable(json)
|
||||
|
||||
function(add_mscclpp_test_executable name sources)
|
||||
|
||||
Reference in New Issue
Block a user