mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-13 17:55:52 +00:00
Merge pull request #29 from microsoft/crutcher-python
This PR adds the hooks for python binding and also adds testing environment for different functions available in mscclpp.h.
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,2 +1,5 @@
|
||||
.vscode/
|
||||
build/
|
||||
__pycache__
|
||||
.*.swp
|
||||
.idea/
|
||||
|
||||
2
python/.gitignore
vendored
Normal file
2
python/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
.*.swp
|
||||
.venv/
|
||||
72
python/CMakeLists.txt
Normal file
72
python/CMakeLists.txt
Normal file
@@ -0,0 +1,72 @@
|
||||
project(mscclpp)
|
||||
cmake_minimum_required(VERSION 3.18...3.22)
|
||||
find_package(Python 3.9 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
|
||||
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
|
||||
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
# Create CMake targets for all Python components needed by nanobind
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.26)
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module Development.SABIModule REQUIRED)
|
||||
else()
|
||||
find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED)
|
||||
endif()
|
||||
|
||||
# Detect the installed nanobind package and import it into CMake
|
||||
execute_process(
|
||||
COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())"
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
|
||||
list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
|
||||
find_package(nanobind CONFIG REQUIRED)
|
||||
|
||||
set(CUDA_DIR "/usr/local/cuda")
|
||||
|
||||
set(MSCCLPP_DIR ${CMAKE_CURRENT_LIST_DIR}/../build)
|
||||
|
||||
nanobind_add_module(
|
||||
_py_mscclpp
|
||||
NOSTRIP
|
||||
NB_STATIC
|
||||
src/_py_mscclpp.cpp
|
||||
)
|
||||
|
||||
target_include_directories(
|
||||
_py_mscclpp
|
||||
PUBLIC
|
||||
${CUDA_DIR}/include
|
||||
${MSCCLPP_DIR}/include
|
||||
)
|
||||
target_link_directories(
|
||||
_py_mscclpp
|
||||
PUBLIC
|
||||
${CUDA_DIR}/lib
|
||||
${MSCCLPP_DIR}/lib
|
||||
)
|
||||
|
||||
target_link_libraries(
|
||||
_py_mscclpp
|
||||
PUBLIC
|
||||
mscclpp
|
||||
)
|
||||
|
||||
add_custom_target(build-package ALL DEPENDS _py_mscclpp)
|
||||
add_custom_command(
|
||||
TARGET build-package POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/src/mscclpp
|
||||
${CMAKE_CURRENT_BINARY_DIR}/mscclpp)
|
||||
|
||||
add_custom_command(
|
||||
TARGET build-package POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
$<TARGET_FILE:_py_mscclpp>
|
||||
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/)
|
||||
|
||||
add_custom_command(
|
||||
TARGET build-package POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy
|
||||
${MSCCLPP_DIR}/lib/libmscclpp.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/)
|
||||
|
||||
4
python/Makefile
Normal file
4
python/Makefile
Normal file
@@ -0,0 +1,4 @@
|
||||
|
||||
test:
|
||||
./test.sh
|
||||
|
||||
77
python/README.md
Normal file
77
python/README.md
Normal file
@@ -0,0 +1,77 @@
|
||||
# Python bindings
|
||||
|
||||
Test instructions:
|
||||
* Compile the `libmscclpp.so` library.
|
||||
* Install `cmake` verion >= 3.18
|
||||
* setup a python virtual env
|
||||
* `pip install -r requirements.txt`
|
||||
* `./tesh.sh`
|
||||
|
||||
Rough build attemtps
|
||||
```
|
||||
# cd to this directory:
|
||||
|
||||
# setup/enter pyenv environment for python 3.9
|
||||
|
||||
# install nanabind and the test requirements.
|
||||
pip install -r requirements.txt
|
||||
|
||||
# setup and build the CMake environments.
|
||||
# this requires nanobind, installed above.
|
||||
./setup.sh
|
||||
|
||||
# test the module
|
||||
pytest build/mscclpp
|
||||
```
|
||||
|
||||
|
||||
## Installing `gdrcopy` and `mpi`
|
||||
This assumes that some things are built/installed
|
||||
```
|
||||
# assumes WORKDIR has:
|
||||
# git clone git@github.com/NVIDIA/gdrcopy.git
|
||||
# git clone git@github.com:microsoft/mscclpp.git
|
||||
|
||||
uname -r
|
||||
# 5.4.0-1090-azure
|
||||
|
||||
# install
|
||||
|
||||
# break /usr/sbin/policy-rc.d so we can install modules
|
||||
echo '#!/bin/sh
|
||||
exit 0' > /usr/sbin/policy-rc.d
|
||||
|
||||
apt update
|
||||
apt install -y \
|
||||
build-essential devscripts debhelper check \
|
||||
libsubunit-dev fakeroot pkg-config dkms \
|
||||
linux-headers-5.4.0-1090-azure
|
||||
|
||||
apt install -y nvidia-dkms-525-server
|
||||
|
||||
|
||||
cd $WORKDIR/gdrcopy
|
||||
sed -i 's/\(-L \$(CUDA)\/lib64\)/\1 \1\/stubs/' tests/Makefile
|
||||
cd packages
|
||||
CUDA=/usr/local/cuda ./build-deb-packages.sh
|
||||
|
||||
dpkg -i gdrdrv-dkms_2.3-1_amd64.Ubuntu20_04.deb
|
||||
dpkg -i libgdrapi_2.3-1_amd64.Ubuntu20_04.deb
|
||||
dpkg -i gdrcopy-tests_2.3-1_amd64.Ubuntu20_04+cuda11.6.deb
|
||||
dpkg -i gdrcopy_2.3-1_amd64.Ubuntu20_04.deb
|
||||
|
||||
# validate:
|
||||
# $ sanity
|
||||
# Running suite(s): Sanity
|
||||
# 100%: Checks: 27, Failures: 0, Errors: 0
|
||||
|
||||
# dkms install -m gdrdrv/2.3
|
||||
|
||||
cd $WORKDIR/mscclpp
|
||||
|
||||
## numctl
|
||||
apt install -y numactl libnuma-dev libnuma1
|
||||
|
||||
# if not mpi testing
|
||||
USE_MPI_FOR_TESTS=0 make -j
|
||||
```
|
||||
24
python/ci.sh
Executable file
24
python/ci.sh
Executable file
@@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
# CI hook script.
|
||||
|
||||
set -ex
|
||||
|
||||
# CD to this directory.
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd $SCRIPT_DIR
|
||||
|
||||
# clean env
|
||||
rm -rf .venv build
|
||||
|
||||
# setup a python virtual env
|
||||
python -m venv .venv
|
||||
|
||||
# activate the virtual env
|
||||
source .venv/bin/activate
|
||||
|
||||
# install venv deps.
|
||||
pip install -r requirements.txt
|
||||
|
||||
# run the build and test.
|
||||
./test.sh
|
||||
|
||||
9
python/format.sh
Executable file
9
python/format.sh
Executable file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
|
||||
|
||||
clang-format -style='{
|
||||
"BasedOnStyle": "google",
|
||||
"BinPackParameters": false,
|
||||
"BinPackArguments": false,
|
||||
"AlignAfterOpenBracket": "AlwaysBreak"
|
||||
}' -i src/*.cpp
|
||||
|
||||
3
python/requirements.txt
Normal file
3
python/requirements.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
nanobind
|
||||
pytest
|
||||
PyHamcrest
|
||||
6
python/setup.sh
Executable file
6
python/setup.sh
Executable file
@@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
cmake -S . -B build
|
||||
cmake --build build --clean-first -v
|
||||
|
||||
262
python/src/_py_mscclpp.cpp
Normal file
262
python/src/_py_mscclpp.cpp
Normal file
@@ -0,0 +1,262 @@
|
||||
#include <mscclpp.h>
|
||||
#include <nanobind/nanobind.h>
|
||||
#include <nanobind/stl/string.h>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
namespace nb = nanobind;
|
||||
using namespace nb::literals;
|
||||
|
||||
// This is a poorman's substitute for std::format, which is a C++20 feature.
|
||||
template <typename... Args>
|
||||
std::string string_format(const std::string &format, Args... args) {
|
||||
// Shutup format warning.
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wformat-security"
|
||||
|
||||
// Dry-run to the get the buffer size:
|
||||
// Extra space for '\0'
|
||||
int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + 1;
|
||||
if (size_s <= 0) {
|
||||
throw std::runtime_error("Error during formatting.");
|
||||
}
|
||||
|
||||
// allocate buffer
|
||||
auto size = static_cast<size_t>(size_s);
|
||||
std::unique_ptr<char[]> buf(new char[size]);
|
||||
|
||||
// actually format
|
||||
std::snprintf(buf.get(), size, format.c_str(), args...);
|
||||
|
||||
// Bulid the return string.
|
||||
// We don't want the '\0' inside
|
||||
return std::string(buf.get(), buf.get() + size - 1);
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
}
|
||||
|
||||
// Maybe return the value, maybe throw an exception.
|
||||
template <typename... Args>
|
||||
void checkResult(
|
||||
mscclppResult_t status, const std::string &format, Args... args) {
|
||||
switch (status) {
|
||||
case mscclppSuccess:
|
||||
return;
|
||||
|
||||
case mscclppUnhandledCudaError:
|
||||
case mscclppSystemError:
|
||||
case mscclppInternalError:
|
||||
case mscclppRemoteError:
|
||||
case mscclppInProgress:
|
||||
case mscclppNumResults:
|
||||
throw std::runtime_error(string_format(format, args...));
|
||||
|
||||
case mscclppInvalidArgument:
|
||||
case mscclppInvalidUsage:
|
||||
default:
|
||||
throw std::invalid_argument(string_format(format, args...));
|
||||
}
|
||||
}
|
||||
|
||||
// Maybe return the value, maybe throw an exception.
|
||||
template <typename Val, typename... Args>
|
||||
Val maybe(
|
||||
mscclppResult_t status, Val val, const std::string &format, Args... args) {
|
||||
checkResult(status, format, args...);
|
||||
return val;
|
||||
}
|
||||
|
||||
// Wrapper around connection state.
|
||||
struct MscclppComm {
|
||||
mscclppComm_t _handle;
|
||||
bool _is_open = false;
|
||||
|
||||
public:
|
||||
~MscclppComm() { close(); }
|
||||
|
||||
// Close should be safe to call on a closed handle.
|
||||
void close() {
|
||||
if (_is_open) {
|
||||
checkResult(mscclppCommDestroy(_handle), "Failed to close comm channel");
|
||||
_handle = 0;
|
||||
_is_open = false;
|
||||
}
|
||||
}
|
||||
|
||||
void check_open() {
|
||||
if (!_is_open) {
|
||||
throw std::invalid_argument("MscclppComm is not open");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static const std::string DOC_MscclppUniqueId =
|
||||
"MSCCLPP Unique Id; used by the MPI Interface";
|
||||
|
||||
static const std::string DOC_MscclppComm = "MSCCLPP Communications Handle";
|
||||
|
||||
|
||||
NB_MODULE(_py_mscclpp, m) {
|
||||
m.doc() = "Python bindings for MSCCLPP: which is not NCCL";
|
||||
|
||||
m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES;
|
||||
|
||||
nb::class_<mscclppUniqueId>(m, "MscclppUniqueId")
|
||||
.def_ro_static("__doc__", &DOC_MscclppUniqueId)
|
||||
.def_static(
|
||||
"from_context",
|
||||
[]() {
|
||||
mscclppUniqueId uniqueId;
|
||||
return maybe(
|
||||
mscclppGetUniqueId(&uniqueId),
|
||||
uniqueId,
|
||||
"Failed to get MSCCLP Unique Id.");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def_static(
|
||||
"from_bytes",
|
||||
[](nb::bytes source) {
|
||||
if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) {
|
||||
throw std::invalid_argument(string_format(
|
||||
"Requires exactly %d bytes; found %d",
|
||||
MSCCLPP_UNIQUE_ID_BYTES,
|
||||
source.size()));
|
||||
}
|
||||
|
||||
mscclppUniqueId uniqueId;
|
||||
std::memcpy(
|
||||
uniqueId.internal, source.c_str(), sizeof(uniqueId.internal));
|
||||
return uniqueId;
|
||||
})
|
||||
.def("bytes", [](mscclppUniqueId id) {
|
||||
return nb::bytes(id.internal, sizeof(id.internal));
|
||||
});
|
||||
|
||||
nb::class_<MscclppComm>(m, "MscclppComm")
|
||||
.def_ro_static("__doc__", &DOC_MscclppComm)
|
||||
.def_static(
|
||||
"init_rank_from_address",
|
||||
[](const std::string &address, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(
|
||||
mscclppCommInitRank(
|
||||
&comm._handle, world_size, address.c_str(), rank),
|
||||
comm,
|
||||
"Failed to initialize comms: %s rank=%d world_size=%d",
|
||||
address,
|
||||
rank,
|
||||
world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"address"_a,
|
||||
"rank"_a,
|
||||
"world_size"_a,
|
||||
"Initialize comms given an IP address, rank, and world_size")
|
||||
.def_static(
|
||||
"init_rank_from_id",
|
||||
[](const mscclppUniqueId &id, int rank, int world_size) {
|
||||
MscclppComm comm = {0};
|
||||
comm._is_open = true;
|
||||
return maybe(
|
||||
mscclppCommInitRankFromId(&comm._handle, world_size, id, rank),
|
||||
comm,
|
||||
"Failed to initialize comms: %02X%s rank=%d world_size=%d",
|
||||
id.internal,
|
||||
rank,
|
||||
world_size);
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"id"_a,
|
||||
"rank"_a,
|
||||
"world_size"_a,
|
||||
"Initialize comms given u UniqueID, rank, and world_size")
|
||||
.def(
|
||||
"opened",
|
||||
[](MscclppComm &comm) { return comm._is_open; },
|
||||
"Is this comm object opened?")
|
||||
.def(
|
||||
"closed",
|
||||
[](MscclppComm &comm) { return !comm._is_open; },
|
||||
"Is this comm object closed?")
|
||||
.def(
|
||||
"rank",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
int rank;
|
||||
return maybe(
|
||||
mscclppCommRank(comm._handle, &rank),
|
||||
rank,
|
||||
"Failed to retrieve MSCCLPP rank");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"The rank of this node.")
|
||||
.def(
|
||||
"size",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
int size;
|
||||
return maybe(
|
||||
mscclppCommSize(comm._handle, &size),
|
||||
size,
|
||||
"Failed to retrieve MSCCLPP world size");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"The world size of this node.")
|
||||
.def(
|
||||
"connection_setup",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppConnectionSetup(comm._handle),
|
||||
true,
|
||||
"Failed to settup MSCCLPP connection");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Run connection setup for MSCCLPP.")
|
||||
.def(
|
||||
"launch_proxy",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppProxyLaunch(comm._handle),
|
||||
true,
|
||||
"Failed to launch MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Start the MSCCLPP proxy.")
|
||||
.def(
|
||||
"stop_proxy",
|
||||
[](MscclppComm &comm) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppProxyStop(comm._handle),
|
||||
true,
|
||||
"Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>(),
|
||||
"Start the MSCCLPP proxy.")
|
||||
.def(
|
||||
"close",
|
||||
&MscclppComm::close,
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def(
|
||||
"__del__",
|
||||
&MscclppComm::close,
|
||||
nb::call_guard<nb::gil_scoped_release>())
|
||||
.def(
|
||||
"bootstrap_all_gather",
|
||||
[](MscclppComm &comm, void *data, int size) {
|
||||
comm.check_open();
|
||||
return maybe(
|
||||
mscclppBootstrapAllGather(comm._handle, data, size),
|
||||
true,
|
||||
"Failed to stop MSCCLPP proxy");
|
||||
},
|
||||
nb::call_guard<nb::gil_scoped_release>());
|
||||
|
||||
}
|
||||
13
python/src/mscclpp/__init__.py
Normal file
13
python/src/mscclpp/__init__.py
Normal file
@@ -0,0 +1,13 @@
|
||||
from . import _py_mscclpp
|
||||
|
||||
__all__ = (
|
||||
"MscclppUniqueId",
|
||||
"MSCCLPP_UNIQUE_ID_BYTES",
|
||||
"MscclppComm",
|
||||
)
|
||||
|
||||
MscclppUniqueId = _py_mscclpp.MscclppUniqueId
|
||||
MSCCLPP_UNIQUE_ID_BYTES = _py_mscclpp.MSCCLPP_UNIQUE_ID_BYTES
|
||||
|
||||
MscclppComm = _py_mscclpp.MscclppComm
|
||||
|
||||
79
python/src/mscclpp/test_mscclpp.py
Normal file
79
python/src/mscclpp/test_mscclpp.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import concurrent.futures
|
||||
import unittest
|
||||
import hamcrest
|
||||
|
||||
import mscclpp
|
||||
|
||||
|
||||
class UniqueIdTest(unittest.TestCase):
|
||||
def test_no_constructor(self) -> None:
|
||||
hamcrest.assert_that(
|
||||
hamcrest.calling(mscclpp.MscclppUniqueId).with_args(),
|
||||
hamcrest.raises(
|
||||
TypeError,
|
||||
"no constructor",
|
||||
),
|
||||
)
|
||||
|
||||
def test_getUniqueId(self) -> None:
|
||||
myId = mscclpp.MscclppUniqueId.from_context()
|
||||
|
||||
hamcrest.assert_that(
|
||||
myId.bytes(),
|
||||
hamcrest.has_length(mscclpp.MSCCLPP_UNIQUE_ID_BYTES),
|
||||
)
|
||||
|
||||
# from_bytes should work
|
||||
copy = mscclpp.MscclppUniqueId.from_bytes(myId.bytes())
|
||||
hamcrest.assert_that(
|
||||
copy.bytes(),
|
||||
hamcrest.equal_to(myId.bytes()),
|
||||
)
|
||||
|
||||
# bad size
|
||||
hamcrest.assert_that(
|
||||
hamcrest.calling(mscclpp.MscclppUniqueId.from_bytes).with_args(b'abc'),
|
||||
hamcrest.raises(
|
||||
ValueError,
|
||||
f"Requires exactly {mscclpp.MSCCLPP_UNIQUE_ID_BYTES} bytes; found 3"
|
||||
),
|
||||
)
|
||||
|
||||
def all_gather_task(rank: int, world_size: int) -> None:
|
||||
comm_options = dict(
|
||||
address="127.0.0.1:50000",
|
||||
rank=rank,
|
||||
world_size=world_size,
|
||||
)
|
||||
print(f'{comm_options=}', flush=True)
|
||||
|
||||
comm = mscclpp.MscclppComm.init_rank_from_address(**comm_options)
|
||||
|
||||
buf = bytearray(world_size)
|
||||
buf[rank] = rank
|
||||
|
||||
if False:
|
||||
# crashes, bad call structure..
|
||||
comm.bootstrap_all_gather(memoryview(buf), world_size)
|
||||
hamcrest.assert_that(
|
||||
buf,
|
||||
hamcrest.equal_to(b'\000\002'),
|
||||
)
|
||||
|
||||
comm.close()
|
||||
|
||||
|
||||
class CommsTest(unittest.TestCase):
|
||||
def test_all_gather(self) -> None:
|
||||
world_size = 2
|
||||
|
||||
tasks: list[concurrent.futures.Future[None]] = []
|
||||
|
||||
with concurrent.futures.ProcessPoolExecutor(max_workers=world_size) as pool:
|
||||
for rank in range(world_size):
|
||||
tasks.append(pool.submit(all_gather_task, rank, world_size))
|
||||
|
||||
for f in concurrent.futures.as_completed(tasks):
|
||||
f.result()
|
||||
|
||||
|
||||
11
python/test.sh
Executable file
11
python/test.sh
Executable file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
|
||||
|
||||
set -ex
|
||||
|
||||
if ! [ -d build ] ; then
|
||||
./setup.sh
|
||||
fi
|
||||
|
||||
cmake --build build
|
||||
|
||||
pytest build/mscclpp
|
||||
Reference in New Issue
Block a user