From be96f38ba397d384d4ebe290aef2575db836404f Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Thu, 23 Mar 2023 21:36:09 +0000 Subject: [PATCH 01/12] Work towards a nanobind wrapper --- .gitignore | 2 + python/.gitignore | 2 + python/CMakeLists.txt | 53 +++++++ python/Makefile | 4 + python/README.md | 60 ++++++++ python/mscclpp/__init__.py | 13 ++ .../__pycache__/__init__.cpython-39.pyc | Bin 0 -> 283 bytes .../test_mscclpp.cpython-39-pytest-7.2.0.pyc | Bin 0 -> 1652 bytes ..._py_mscclpp.cpython-39-x86_64-linux-gnu.so | 1 + python/mscclpp/test_mscclpp.py | 49 +++++++ python/requirements.txt | 3 + python/setup.sh | 7 + python/src/_py_mscclpp.cpp | 129 ++++++++++++++++++ python/test.sh | 11 ++ 14 files changed, 334 insertions(+) create mode 100644 python/.gitignore create mode 100644 python/CMakeLists.txt create mode 100644 python/Makefile create mode 100644 python/README.md create mode 100644 python/mscclpp/__init__.py create mode 100644 python/mscclpp/__pycache__/__init__.cpython-39.pyc create mode 100644 python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc create mode 120000 python/mscclpp/_py_mscclpp.cpython-39-x86_64-linux-gnu.so create mode 100644 python/mscclpp/test_mscclpp.py create mode 100644 python/requirements.txt create mode 100755 python/setup.sh create mode 100644 python/src/_py_mscclpp.cpp create mode 100755 python/test.sh diff --git a/.gitignore b/.gitignore index e524d792..1739c837 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .vscode/ build/ +__pycache__ +.*.swp diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 00000000..19bd21ec --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,2 @@ +.*.swp + diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt new file mode 100644 index 00000000..8889ae1b --- /dev/null +++ b/python/CMakeLists.txt @@ -0,0 +1,53 @@ +project(mscclpp) +cmake_minimum_required(VERSION 3.18...3.22) +find_package(Python 3.9 COMPONENTS Interpreter Development.Module 
REQUIRED) + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +# Create CMake targets for all Python components needed by nanobind +if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.26) + find_package(Python 3.8 COMPONENTS Interpreter Development.Module Development.SABIModule REQUIRED) +else() + find_package(Python 3.8 COMPONENTS Interpreter Development.Module REQUIRED) +endif() + +# Detect the installed nanobind package and import it into CMake +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())" + OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR) +list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") +find_package(nanobind CONFIG REQUIRED) + +set(CUDA_DIR "/usr/local/cuda") + +set(MSCCLPP_DIR ${CMAKE_CURRENT_LIST_DIR}/../build) + +nanobind_add_module( + _py_mscclpp + NOSTRIP + NB_SHARED + src/_py_mscclpp.cpp +) + +target_include_directories( + _py_mscclpp + PUBLIC + ${CUDA_DIR}/include + ${MSCCLPP_DIR}/include +) +target_link_directories( + _py_mscclpp + PUBLIC + ${CUDA_DIR}/lib + ${MSCCLPP_DIR}/lib +) + +target_link_libraries( + _py_mscclpp + PUBLIC + mscclpp +) + diff --git a/python/Makefile b/python/Makefile new file mode 100644 index 00000000..aa7c222c --- /dev/null +++ b/python/Makefile @@ -0,0 +1,4 @@ + +test: + ./test.sh + diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..961d00ae --- /dev/null +++ b/python/README.md @@ -0,0 +1,60 @@ +This assumes that some things are built/installed +``` +# assumes WORKDIR has: +# git clone git@github.com/NVIDIA/gdrcopy.git +# git clone git@github.com:microsoft/mscclpp.git + +uname -r +# 5.4.0-1090-azure + +# install + +apt update +apt install -y \ + build-essential devscripts debhelper check \ + libsubunit-dev fakeroot pkg-config dkms \ + 
nvidia-dkms-525-server \ + linux-headers-5.4.0-1090-azure + + +cd $WORKDIR/gdrcopy +sed -i 's/\(-L \$(CUDA)\/lib64\)/\1 \1\/stubs/' tests/Makefile +cd packages +CUDA=/usr/local/cuda ./build-deb-packages.sh + +dpkg -i gdrdrv-dkms_2.3-1_amd64.Ubuntu20_04.deb +dpkg -i libgdrapi_2.3-1_amd64.Ubuntu20_04.deb +dpkg -i gdrcopy-tests_2.3-1_amd64.Ubuntu20_04+cuda11.6.deb +dpkg -i gdrcopy_2.3-1_amd64.Ubuntu20_04.deb + +# validate: +# $ sanity +# Running suite(s): Sanity +# 100%: Checks: 27, Failures: 0, Errors: 0 + +# dkms install -m gdrdrv/2.3 + +cd $WORKDIR/mscclpp + +## numctl +apt install -y numactl libnuma-dev libnuma1 + +# if not mpi testing +USE_MPI_FOR_TESTS=0 make -j +``` + + +Rough build attemtps +``` +# cd to this directory: + +cmake -S . -B build +cmake --build build --clean-first -v + +# this should contain libmscclpp.so, but does not +ldd build/py_mscclpp.cpython-39-x86_64-linux-gnu.so + +# this will fail due to a missing symbol +( cd build; + LD_LIBRARY_PATH="$PWD/../../build/lib:$LD_LIBRARY_PATH" python -c 'import py_mscclpp' ) +``` diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py new file mode 100644 index 00000000..e825b92d --- /dev/null +++ b/python/mscclpp/__init__.py @@ -0,0 +1,13 @@ +from . 
import _py_mscclpp + +__all__ = ( + "MscclppUniqueId", + "MSCCLPP_UNIQUE_ID_BYTES", + "MscclppComm", +) + +MscclppUniqueId = _py_mscclpp.MscclppUniqueId +MSCCLPP_UNIQUE_ID_BYTES = _py_mscclpp.MSCCLPP_UNIQUE_ID_BYTES + +MscclppComm = _py_mscclpp.MscclppComm + diff --git a/python/mscclpp/__pycache__/__init__.cpython-39.pyc b/python/mscclpp/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..67d2f462357d383f2b6834085c8d1ba367796c37 GIT binary patch literal 283 zcmYe~<>g`kf@3>mQaBhH7#@Q-$bbpRaRB0CH6W3~kiwY5kjogw$jFew6vdRvn8h5$ z3?x~iSb!vJ6l)4|FoPz`OQ1;%nvA!&;|nU|bBmLca|#MHnQ!s?LYSd>nT4gPo+-D) zeS@8yeF6gFL;XAhLtW!NUE-Z0LtKMzaYI!)=jZ17X|mm7C<3|T7JGbrVopwcd=WE{ zSHuD$Sb@Y!h9V9S8$|rl)i27=FVRoVPf68>TCHDDS(1^T2j|AeXXa&=#K-FuRNmsS Z$<0qG%}KQbIk^~Q0uK`l8yh1V69BEsMaBRC literal 0 HcmV?d00001 diff --git a/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc b/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91fce504cdddcd4888539c3ccd55220efff36cce GIT binary patch literal 1652 zcmZ`(UvDEd5MO)$W}BwvkT{Ni3I`9DduXp9gaimxgbG#SqJy?b?2ENsyUm9Eld~Q5 z5>Z~bS3UtCC6D(t_!xfW>AnI;6*JzXq^Ni`vz{4yJf7c-)6LD;fsy?27XRCGoIg=m zAE0ygfm8yFJDe0wLhPTGE_H}gdWlaOOOOQC5+;#rzca{?uMZc5eIWG!n6PIjloL*WBi!ZQb1!k1lFtK>JnJnq?UO2h zIuVEbSSUSsK};Zpp2#XA;Ab^(9s>1&)ce4+Gbcb)yCZKw=KcaTfetxY&;`Y7La1~w z>r{0&t1G4DB-6F*MJ5`iWhUV)vymz#q-J`UY7=H@QRLOR2}_k_#dvJC9ygO!e5Q4g z>mf_!xq=weTnS}b$J4PmkPMg);`?iFSJt)O&1x=ommYV=Q$4J!Yf(cG zY-!sWPtB`VQPpfMV7#(Xfu%M7#6WN^e(!1*|dp97O;I)X4(7=_Cg{y5d=y>0;WQZFNtlaHCJVXZ57w-4A;m8Nq6a zH!$rpS(lh0EuLxVqdq{rZ}%keFjcG&)wv#;yN{3h{jZ-qVJCybZ%z)_;g{@-?~f0T zOl*BL8zMM9Ef~}Wrdnc;$qjTEf0q`MCAO~I1Xbdd(?f2&tR7D-$PL@V)!P<=bD>wb zY=c(a2j-BNdL$t6l^1LSfB|X4%p-R!m~8~}?bW7#!Cdpy^G(E9m6n1r6EjxUd{W@N z%^1esSR@`O?;^u;Sz{QVq`)vw3ZIFHbXr!EfpB9WD*Hg{4PgJ*#L!v3Y4z)}R2c9; z;;tL@hNmGkeIWG?Fk2fG`ybzeU))Oq73419RkDrh4`y3?AAG#?2)?~fK6(U0V#~io 
zM?KesDd$ikD)D4m{gA{LwJbPO`AqaYTN`F$*{m3qc`dJU&2TBRNH2FAKPze_dUT0q z25bM5C*vWG4>4>^lmjTuI@dR~eFi7K4i None: + hamcrest.assert_that( + hamcrest.calling(mscclpp.MscclppUniqueId).with_args(), + hamcrest.raises( + TypeError, + "no constructor", + ), + ) + + def test_getUniqueId(self) -> None: + myId = mscclpp.MscclppUniqueId.from_context() + + hamcrest.assert_that( + myId.bytes(), + hamcrest.has_length(mscclpp.MSCCLPP_UNIQUE_ID_BYTES), + ) + + # from_bytes should work + copy = mscclpp.MscclppUniqueId.from_bytes(myId.bytes()) + hamcrest.assert_that( + copy.bytes(), + hamcrest.equal_to(myId.bytes()), + ) + + # bad size + hamcrest.assert_that( + hamcrest.calling(mscclpp.MscclppUniqueId.from_bytes).with_args(b'abc'), + hamcrest.raises( + ValueError, + f"Requires exactly {mscclpp.MSCCLPP_UNIQUE_ID_BYTES} bytes; found 3" + ), + ) + + +class CommsTest(unittest.TestCase): + def _test(self) -> None: + # this hangs forever + comm = mscclpp.MscclppComm.init_rank_from_address( + address="127.0.0.1:50000", + rank=0, + world_size=2, + ) + comm.close() diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 00000000..eaf386fd --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,3 @@ +nanobind +pytest +PyHamcrest diff --git a/python/setup.sh b/python/setup.sh new file mode 100755 index 00000000..a8eea6ed --- /dev/null +++ b/python/setup.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +set -ex +cmake -S . -B build +cmake --build build --clean-first -v +ldd build/py_mscclpp.cpython-39-x86_64-linux-gnu.so + diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp new file mode 100644 index 00000000..e708fd47 --- /dev/null +++ b/python/src/_py_mscclpp.cpp @@ -0,0 +1,129 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace nb = nanobind; +using namespace nb::literals; + +// This is a poorman's substitute for std::format, which is a C++20 feature. 
+template +std::string string_format( const std::string& format, Args ... args ) +{ + int size_s = std::snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0' + if( size_s <= 0 ){ throw std::runtime_error( "Error during formatting." ); } + auto size = static_cast( size_s ); + std::unique_ptr buf( new char[ size ] ); + std::snprintf( buf.get(), size, format.c_str(), args ... ); + return std::string( buf.get(), buf.get() + size - 1 ); // We don't want the '\0' inside +} + +template +Val maybe(mscclppResult_t status, Val val, const std::string& format, Args ... args) { + switch (status) { + case mscclppSuccess: + return val; + + case mscclppUnhandledCudaError: + case mscclppSystemError: + case mscclppInternalError: + case mscclppRemoteError: + case mscclppInProgress: + case mscclppNumResults: + throw std::runtime_error(string_format(format, args ...)); + + case mscclppInvalidArgument: + case mscclppInvalidUsage: + default: + throw std::invalid_argument(string_format(format, args ...)); + } +} + +struct MscclppComm { + mscclppComm_t internal; +}; + + +NB_MODULE(_py_mscclpp, m) { + m.doc() = "Python bindings for MSCCLPP"; + + m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; + + nb::class_(m, "MscclppUniqueId") + .def_static("from_context", []() { + mscclppUniqueId uniqueId; + return maybe( + mscclppGetUniqueId(&uniqueId), + uniqueId, + "Failed to get MSCCLP Unique Id." 
+ ); + }) + .def_static("from_bytes", [](nb::bytes source) { + if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) { + throw std::invalid_argument( + string_format( + "Requires exactly %d bytes; found %d", + MSCCLPP_UNIQUE_ID_BYTES, + source.size() + ) + ); + } + + mscclppUniqueId uniqueId; + std::memcpy(uniqueId.internal, source.c_str(), sizeof(uniqueId.internal)); + return uniqueId; + }) + .def("bytes", [](mscclppUniqueId id){ + return nb::bytes(id.internal, sizeof(id.internal)); + }); + + nb::class_(m, "MscclppComm") + .def_static( + "init_rank_from_address", + [](const std::string &address, int rank, int world_size) { + MscclppComm comm = { 0 }; + return maybe( + mscclppCommInitRank(&comm.internal, world_size, rank, address.c_str()), + comm, + "Failed to initialize comms: %s rank=%d world_size=%d", + address, + rank, + world_size); + }, + "address"_a, "rank"_a, "world_size"_a, + "Initialize comms given an IP address, rank, and world_size" + ) + .def_static("init_rank_from_id", [](const mscclppUniqueId &id, int rank, int world_size) { + MscclppComm comm = { 0 }; + return maybe( + mscclppCommInitRankFromId(&comm.internal, world_size, id, rank), + comm, + "Failed to initialize comms: %02X%s rank=%d world_size=%d", + id.internal, + rank, + world_size); + }) + .def("close", [](MscclppComm &comm) { + maybe( + mscclppCommDestroy(comm.internal), + nb::none(), + "Failed to close comm channel" + ); + comm.internal = 0; + }) + .def("__del__", [](MscclppComm &comm) { + maybe( + mscclppCommDestroy(comm.internal), + nb::none(), + "Failed to close comm channel" + ); + comm.internal = 0; + }); + +} + diff --git a/python/test.sh b/python/test.sh new file mode 100755 index 00000000..f675d9ab --- /dev/null +++ b/python/test.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -ex + +if ! 
[ -d build ] ; then + ./setup.sh +fi + +cmake --build build + +pytest mscclpp From eb9b750830ea628033d2afb81241ac74c2d16914 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Thu, 23 Mar 2023 22:14:52 +0000 Subject: [PATCH 02/12] format and guard --- .../test_mscclpp.cpython-39-pytest-7.2.0.pyc | Bin 1652 -> 1662 bytes python/src/_py_mscclpp.cpp | 200 +++++++++--------- 2 files changed, 100 insertions(+), 100 deletions(-) diff --git a/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc b/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc index 91fce504cdddcd4888539c3ccd55220efff36cce..4fd716c340f36b9eb65253eff985cdcfc07b71d3 100644 GIT binary patch delta 103 zcmeyu^N)uwk(ZZ?0SNXVl}Yhn-N^Tyh5Z&wQDR>9t;syB(e|wIC8@Z4G_?L~fir+auH@7$hD6a=rrO8qx0aQ{X2_mFGg!1G$tTzCU)*2iD delta 76 zcmeyz^M!{mk(ZZ?0SKZF$)x;d*~s^ug*}R;C^0WPYBCRNv>Z!GYH^7cP`;Q2q>539 av5L<*KR35H1SqaIxt~>xQE~Dz)*Ap90u;Ug diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index e708fd47..bfce09d3 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -1,129 +1,129 @@ +#include #include #include -#include #include #include #include -#include #include +#include namespace nb = nanobind; using namespace nb::literals; // This is a poorman's substitute for std::format, which is a C++20 feature. -template -std::string string_format( const std::string& format, Args ... args ) -{ - int size_s = std::snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0' - if( size_s <= 0 ){ throw std::runtime_error( "Error during formatting." ); } - auto size = static_cast( size_s ); - std::unique_ptr buf( new char[ size ] ); - std::snprintf( buf.get(), size, format.c_str(), args ... ); - return std::string( buf.get(), buf.get() + size - 1 ); // We don't want the '\0' inside +template +std::string string_format(const std::string &format, Args... args) { + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ + 1; // Extra space for '\0' + if (size_s <= 0) { + throw std::runtime_error("Error during formatting."); + } + auto size = static_cast(size_s); + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args...); + return std::string(buf.get(), + buf.get() + size - 1); // We don't want the '\0' inside } -template -Val maybe(mscclppResult_t status, Val val, const std::string& format, Args ... args) { - switch (status) { - case mscclppSuccess: - return val; +template +Val maybe(mscclppResult_t status, Val val, const std::string &format, + Args... args) { + switch (status) { + case mscclppSuccess: + return val; - case mscclppUnhandledCudaError: - case mscclppSystemError: - case mscclppInternalError: - case mscclppRemoteError: - case mscclppInProgress: - case mscclppNumResults: - throw std::runtime_error(string_format(format, args ...)); + case mscclppUnhandledCudaError: + case mscclppSystemError: + case mscclppInternalError: + case mscclppRemoteError: + case mscclppInProgress: + case mscclppNumResults: + throw std::runtime_error(string_format(format, args...)); - case mscclppInvalidArgument: - case mscclppInvalidUsage: - default: - throw std::invalid_argument(string_format(format, args ...)); - } + case mscclppInvalidArgument: + case mscclppInvalidUsage: + default: + throw std::invalid_argument(string_format(format, args...)); + } } struct MscclppComm { mscclppComm_t internal; }; - NB_MODULE(_py_mscclpp, m) { - m.doc() = "Python bindings for MSCCLPP"; + m.doc() = "Python bindings for MSCCLPP: which is not NCCL"; - m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; + m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; - nb::class_(m, "MscclppUniqueId") - .def_static("from_context", []() { - mscclppUniqueId uniqueId; - return maybe( - mscclppGetUniqueId(&uniqueId), - uniqueId, - "Failed to get MSCCLP Unique Id." 
- ); - }) - .def_static("from_bytes", [](nb::bytes source) { - if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) { - throw std::invalid_argument( - string_format( - "Requires exactly %d bytes; found %d", - MSCCLPP_UNIQUE_ID_BYTES, - source.size() - ) - ); - } + nb::class_(m, "MscclppUniqueId") + .def_static( + "from_context", + []() { + mscclppUniqueId uniqueId; + return maybe(mscclppGetUniqueId(&uniqueId), uniqueId, + "Failed to get MSCCLP Unique Id."); + }, + nb::call_guard()) + .def_static("from_bytes", + [](nb::bytes source) { + if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) { + throw std::invalid_argument(string_format( + "Requires exactly %d bytes; found %d", + MSCCLPP_UNIQUE_ID_BYTES, source.size())); + } - mscclppUniqueId uniqueId; - std::memcpy(uniqueId.internal, source.c_str(), sizeof(uniqueId.internal)); - return uniqueId; - }) - .def("bytes", [](mscclppUniqueId id){ - return nb::bytes(id.internal, sizeof(id.internal)); - }); + mscclppUniqueId uniqueId; + std::memcpy(uniqueId.internal, source.c_str(), + sizeof(uniqueId.internal)); + return uniqueId; + }) + .def("bytes", [](mscclppUniqueId id) { + return nb::bytes(id.internal, sizeof(id.internal)); + }); nb::class_(m, "MscclppComm") - .def_static( - "init_rank_from_address", - [](const std::string &address, int rank, int world_size) { - MscclppComm comm = { 0 }; - return maybe( - mscclppCommInitRank(&comm.internal, world_size, rank, address.c_str()), - comm, - "Failed to initialize comms: %s rank=%d world_size=%d", - address, - rank, - world_size); - }, - "address"_a, "rank"_a, "world_size"_a, - "Initialize comms given an IP address, rank, and world_size" - ) - .def_static("init_rank_from_id", [](const mscclppUniqueId &id, int rank, int world_size) { - MscclppComm comm = { 0 }; - return maybe( - mscclppCommInitRankFromId(&comm.internal, world_size, id, rank), - comm, - "Failed to initialize comms: %02X%s rank=%d world_size=%d", - id.internal, - rank, - world_size); - }) - .def("close", [](MscclppComm &comm) { 
- maybe( - mscclppCommDestroy(comm.internal), - nb::none(), - "Failed to close comm channel" - ); - comm.internal = 0; - }) - .def("__del__", [](MscclppComm &comm) { - maybe( - mscclppCommDestroy(comm.internal), - nb::none(), - "Failed to close comm channel" - ); - comm.internal = 0; - }); - + .def_static( + "init_rank_from_address", + [](const std::string &address, int rank, int world_size) { + MscclppComm comm = {0}; + return maybe(mscclppCommInitRank(&comm.internal, world_size, rank, + address.c_str()), + comm, + "Failed to initialize comms: %s rank=%d world_size=%d", + address, rank, world_size); + }, + nb::call_guard(), "address"_a, "rank"_a, + "world_size"_a, + "Initialize comms given an IP address, rank, and world_size") + .def_static( + "init_rank_from_id", + [](const mscclppUniqueId &id, int rank, int world_size) { + MscclppComm comm = {0}; + return maybe( + mscclppCommInitRankFromId(&comm.internal, world_size, id, rank), + comm, + "Failed to initialize comms: %02X%s rank=%d world_size=%d", + id.internal, rank, world_size); + }, + nb::call_guard(), "id"_a, "rank"_a, + "world_size"_a, + "Initialize comms given u UniqueID, rank, and world_size") + .def( + "close", + [](MscclppComm &comm) { + maybe(mscclppCommDestroy(comm.internal), nb::none(), + "Failed to close comm channel"); + comm.internal = 0; + }, + nb::call_guard()) + .def( + "__del__", + [](MscclppComm &comm) { + maybe(mscclppCommDestroy(comm.internal), nb::none(), + "Failed to close comm channel"); + comm.internal = 0; + }, + nb::call_guard()); } - From 48e4bac1e0ab992c69adf6445fc879438930fff4 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 00:55:49 +0000 Subject: [PATCH 03/12] formatting and additional methods --- python/format.sh | 6 + python/src/_py_mscclpp.cpp | 227 ++++++++++++++++++++++++++++--------- 2 files changed, 180 insertions(+), 53 deletions(-) create mode 100755 python/format.sh diff --git a/python/format.sh b/python/format.sh new file mode 100755 index 
00000000..7ee27183 --- /dev/null +++ b/python/format.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +clang-format \ + -style='{"BasedOnStyle": "google", "BinPackParameters": false, "BinPackArguments": false, "AlignAfterOpenBracket": "AlwaysBreak"}' \ + -i src/* + diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index bfce09d3..e485a8a8 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -14,116 +14,237 @@ using namespace nb::literals; // This is a poorman's substitute for std::format, which is a C++20 feature. template std::string string_format(const std::string &format, Args... args) { - int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + - 1; // Extra space for '\0' +// Shutup format warning. +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat-security" + + // Dry-run to the get the buffer size: + // Extra space for '\0' + int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) + 1; if (size_s <= 0) { throw std::runtime_error("Error during formatting."); } + + // allocate buffer auto size = static_cast(size_s); std::unique_ptr buf(new char[size]); + + // actually format std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), - buf.get() + size - 1); // We don't want the '\0' inside + + // Bulid the return string. + // We don't want the '\0' inside + return std::string(buf.get(), buf.get() + size - 1); + +#pragma GCC diagnostic pop } -template -Val maybe(mscclppResult_t status, Val val, const std::string &format, - Args... args) { +// Maybe return the value, maybe throw an exception. +template +void checkResult( + mscclppResult_t status, const std::string &format, Args... 
args) { switch (status) { - case mscclppSuccess: - return val; + case mscclppSuccess: + return; - case mscclppUnhandledCudaError: - case mscclppSystemError: - case mscclppInternalError: - case mscclppRemoteError: - case mscclppInProgress: - case mscclppNumResults: - throw std::runtime_error(string_format(format, args...)); + case mscclppUnhandledCudaError: + case mscclppSystemError: + case mscclppInternalError: + case mscclppRemoteError: + case mscclppInProgress: + case mscclppNumResults: + throw std::runtime_error(string_format(format, args...)); - case mscclppInvalidArgument: - case mscclppInvalidUsage: - default: - throw std::invalid_argument(string_format(format, args...)); + case mscclppInvalidArgument: + case mscclppInvalidUsage: + default: + throw std::invalid_argument(string_format(format, args...)); } } +// Maybe return the value, maybe throw an exception. +template +Val maybe( + mscclppResult_t status, Val val, const std::string &format, Args... args) { + checkResult(status, format, args...); + return val; +} + +// Wrapper around connection state. struct MscclppComm { - mscclppComm_t internal; + mscclppComm_t _handle; + bool _is_open = false; + + public: + ~MscclppComm() { close(); } + + // Close should be safe to call on a closed handle. 
+ void close() { + if (_is_open) { + checkResult(mscclppCommDestroy(_handle), "Failed to close comm channel"); + _handle = 0; + _is_open = false; + } + } + + void check_open() { + if (!_is_open) { + throw std::invalid_argument("MscclppComm is not open"); + } + } }; +static const std::string DOC_MscclppUniqueId = + "MSCCLPP Unique Id; used by the MPI Interface"; + +static const std::string DOC_MscclppComm = "MSCCLPP Communications Handle"; + NB_MODULE(_py_mscclpp, m) { m.doc() = "Python bindings for MSCCLPP: which is not NCCL"; m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; nb::class_(m, "MscclppUniqueId") + .def_ro_static("__doc__", &DOC_MscclppUniqueId) .def_static( "from_context", []() { mscclppUniqueId uniqueId; - return maybe(mscclppGetUniqueId(&uniqueId), uniqueId, - "Failed to get MSCCLP Unique Id."); + return maybe( + mscclppGetUniqueId(&uniqueId), + uniqueId, + "Failed to get MSCCLP Unique Id."); }, nb::call_guard()) - .def_static("from_bytes", - [](nb::bytes source) { - if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) { - throw std::invalid_argument(string_format( - "Requires exactly %d bytes; found %d", - MSCCLPP_UNIQUE_ID_BYTES, source.size())); - } + .def_static( + "from_bytes", + [](nb::bytes source) { + if (source.size() != MSCCLPP_UNIQUE_ID_BYTES) { + throw std::invalid_argument(string_format( + "Requires exactly %d bytes; found %d", + MSCCLPP_UNIQUE_ID_BYTES, + source.size())); + } - mscclppUniqueId uniqueId; - std::memcpy(uniqueId.internal, source.c_str(), - sizeof(uniqueId.internal)); - return uniqueId; - }) + mscclppUniqueId uniqueId; + std::memcpy( + uniqueId.internal, source.c_str(), sizeof(uniqueId.internal)); + return uniqueId; + }) .def("bytes", [](mscclppUniqueId id) { return nb::bytes(id.internal, sizeof(id.internal)); }); nb::class_(m, "MscclppComm") + .def_ro_static("__doc__", &DOC_MscclppComm) .def_static( "init_rank_from_address", [](const std::string &address, int rank, int world_size) { MscclppComm comm = {0}; - return 
maybe(mscclppCommInitRank(&comm.internal, world_size, rank, - address.c_str()), - comm, - "Failed to initialize comms: %s rank=%d world_size=%d", - address, rank, world_size); + comm._is_open = true; + return maybe( + mscclppCommInitRank( + &comm._handle, world_size, rank, address.c_str()), + comm, + "Failed to initialize comms: %s rank=%d world_size=%d", + address, + rank, + world_size); }, - nb::call_guard(), "address"_a, "rank"_a, + nb::call_guard(), + "address"_a, + "rank"_a, "world_size"_a, "Initialize comms given an IP address, rank, and world_size") .def_static( "init_rank_from_id", [](const mscclppUniqueId &id, int rank, int world_size) { MscclppComm comm = {0}; + comm._is_open = true; return maybe( - mscclppCommInitRankFromId(&comm.internal, world_size, id, rank), + mscclppCommInitRankFromId(&comm._handle, world_size, id, rank), comm, "Failed to initialize comms: %02X%s rank=%d world_size=%d", - id.internal, rank, world_size); + id.internal, + rank, + world_size); }, - nb::call_guard(), "id"_a, "rank"_a, + nb::call_guard(), + "id"_a, + "rank"_a, "world_size"_a, "Initialize comms given u UniqueID, rank, and world_size") .def( - "close", + "opened", + [](MscclppComm &comm) { return comm._is_open; }, + "Is this comm object opened?") + .def( + "closed", + [](MscclppComm &comm) { return !comm._is_open; }, + "Is this comm object closed?") + .def( + "rank", [](MscclppComm &comm) { - maybe(mscclppCommDestroy(comm.internal), nb::none(), - "Failed to close comm channel"); - comm.internal = 0; + comm.check_open(); + int rank; + return maybe( + mscclppCommRank(comm._handle, &rank), + rank, + "Failed to retrieve MSCCLPP rank"); }, + nb::call_guard(), + "The rank of this node.") + .def( + "size", + [](MscclppComm &comm) { + comm.check_open(); + int size; + return maybe( + mscclppCommSize(comm._handle, &size), + size, + "Failed to retrieve MSCCLPP world size"); + }, + nb::call_guard(), + "The world size of this node.") + .def( + "connection_setup", + [](MscclppComm 
&comm) { + comm.check_open(); + return maybe( + mscclppConnectionSetup(comm._handle), + true, + "Failed to settup MSCCLPP connection"); + }, + nb::call_guard(), + "Run connection setup for MSCCLPP.") + .def( + "launch_proxy", + [](MscclppComm &comm) { + comm.check_open(); + return maybe( + mscclppProxyLaunch(comm._handle), + true, + "Failed to launch MSCCLPP proxy"); + }, + nb::call_guard(), + "Start the MSCCLPP proxy.") + .def( + "stop_proxy", + [](MscclppComm &comm) { + comm.check_open(); + return maybe( + mscclppProxyStop(comm._handle), + true, + "Failed to stop MSCCLPP proxy"); + }, + nb::call_guard(), + "Start the MSCCLPP proxy.") + .def( + "close", + &MscclppComm::close, nb::call_guard()) .def( "__del__", - [](MscclppComm &comm) { - maybe(mscclppCommDestroy(comm.internal), nb::none(), - "Failed to close comm channel"); - comm.internal = 0; - }, + &MscclppComm::close, nb::call_guard()); } From 69957baf8d8922235990dd5e77884e2b8e4146b4 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 20:27:39 +0000 Subject: [PATCH 04/12] update readme, build python package dir --- python/CMakeLists.txt | 19 +++++++ python/README.md | 43 +++++++++------ python/format.sh | 9 ++-- python/mscclpp/__init__.py | 13 ----- .../__pycache__/__init__.cpython-39.pyc | Bin 283 -> 0 bytes .../test_mscclpp.cpython-39-pytest-7.2.0.pyc | Bin 1662 -> 0 bytes ..._py_mscclpp.cpython-39-x86_64-linux-gnu.so | 1 - python/mscclpp/test_mscclpp.py | 49 ------------------ python/setup.sh | 1 - python/test.sh | 2 +- 10 files changed, 53 insertions(+), 84 deletions(-) delete mode 100644 python/mscclpp/__init__.py delete mode 100644 python/mscclpp/__pycache__/__init__.cpython-39.pyc delete mode 100644 python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc delete mode 120000 python/mscclpp/_py_mscclpp.cpython-39-x86_64-linux-gnu.so delete mode 100644 python/mscclpp/test_mscclpp.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8889ae1b..3b2bdacc 100644 
--- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -51,3 +51,22 @@ target_link_libraries( mscclpp ) +add_custom_target(build-package ALL DEPENDS _py_mscclpp) +add_custom_command( + TARGET build-package POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory + ${CMAKE_CURRENT_SOURCE_DIR}/src/mscclpp + ${CMAKE_CURRENT_BINARY_DIR}/mscclpp) + +add_custom_command( + TARGET build-package POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + $ + ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/) + +add_custom_command( + TARGET build-package POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${MSCCLPP_DIR}/lib/libmscclpp.so + ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/) + diff --git a/python/README.md b/python/README.md index 961d00ae..a00b200a 100644 --- a/python/README.md +++ b/python/README.md @@ -1,3 +1,30 @@ +# Python bindings + +Test instructions: + * Compile the `libmscclpp.so` library. + * setup a python virtual env + * `pip install -r requirements.txt` + * `./tesh.sh` + +Rough build attemtps +``` +# cd to this directory: + +# setup/enter pyenv environment for python 3.9 + +# install nanabind and the test requirements. +pip install -r requirements.txt + +# setup and build the CMake environments. +# this requires nanobind, installed above. +./setup.sh + +# test the module +pytest build/mscclpp +``` + + +## Installing `gdrcopy` and `mpi` This assumes that some things are built/installed ``` # assumes WORKDIR has: @@ -42,19 +69,3 @@ apt install -y numactl libnuma-dev libnuma1 # if not mpi testing USE_MPI_FOR_TESTS=0 make -j ``` - - -Rough build attemtps -``` -# cd to this directory: - -cmake -S . 
-B build -cmake --build build --clean-first -v - -# this should contain libmscclpp.so, but does not -ldd build/py_mscclpp.cpython-39-x86_64-linux-gnu.so - -# this will fail due to a missing symbol -( cd build; - LD_LIBRARY_PATH="$PWD/../../build/lib:$LD_LIBRARY_PATH" python -c 'import py_mscclpp' ) -``` diff --git a/python/format.sh b/python/format.sh index 7ee27183..c3b1dbac 100755 --- a/python/format.sh +++ b/python/format.sh @@ -1,6 +1,9 @@ #!/bin/bash -clang-format \ - -style='{"BasedOnStyle": "google", "BinPackParameters": false, "BinPackArguments": false, "AlignAfterOpenBracket": "AlwaysBreak"}' \ - -i src/* +clang-format -style='{ + "BasedOnStyle": "google", + "BinPackParameters": false, + "BinPackArguments": false, + "AlignAfterOpenBracket": "AlwaysBreak" +}' -i src/*.cpp diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py deleted file mode 100644 index e825b92d..00000000 --- a/python/mscclpp/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -from . import _py_mscclpp - -__all__ = ( - "MscclppUniqueId", - "MSCCLPP_UNIQUE_ID_BYTES", - "MscclppComm", -) - -MscclppUniqueId = _py_mscclpp.MscclppUniqueId -MSCCLPP_UNIQUE_ID_BYTES = _py_mscclpp.MSCCLPP_UNIQUE_ID_BYTES - -MscclppComm = _py_mscclpp.MscclppComm - diff --git a/python/mscclpp/__pycache__/__init__.cpython-39.pyc b/python/mscclpp/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 67d2f462357d383f2b6834085c8d1ba367796c37..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 283 zcmYe~<>g`kf@3>mQaBhH7#@Q-$bbpRaRB0CH6W3~kiwY5kjogw$jFew6vdRvn8h5$ z3?x~iSb!vJ6l)4|FoPz`OQ1;%nvA!&;|nU|bBmLca|#MHnQ!s?LYSd>nT4gPo+-D) zeS@8yeF6gFL;XAhLtW!NUE-Z0LtKMzaYI!)=jZ17X|mm7C<3|T7JGbrVopwcd=WE{ zSHuD$Sb@Y!h9V9S8$|rl)i27=FVRoVPf68>TCHDDS(1^T2j|AeXXa&=#K-FuRNmsS Z$<0qG%}KQbIk^~Q0uK`l8yh1V69BEsMaBRC diff --git a/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc b/python/mscclpp/__pycache__/test_mscclpp.cpython-39-pytest-7.2.0.pyc deleted file mode 
100644 index 4fd716c340f36b9eb65253eff985cdcfc07b71d3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1662 zcmZ`(&2A$_5T5S&@p$6cVUbvtKUupRFb8J?LP&s6goz*$Yb_)}VlJ)8bSEA&Ki=-P zvbG2p_R13gl6~B#;KWPxmD9cg5)oBBc5qPWQPp%+S65fpSJlb(cI3eL{f{^Jm%iit ziNfaaK-dRTF);3Mk~=Z6e_FWIAx`1NKB-rNIIt^?xMBZc9Dd<&%H3xUcSSI6{^Z6H zXguzN#$Rc+I5~29!3*&33{&Kr!v$dqKCrPX9+L9ST`t#>R0 z=R&V>*#WJ(56mGE^+-UXD=*jq00Yv3nMdweFk1-bo9j*gg1O9)qU>8@qm?va0M$aUnA;O!Hc08f#qyo6#KJCKFEK q?kLo~fsJFrV~__4Ja%}zJ^Z#PP54P&Tc2X?6>hyxLK@OL5B>q%te)-w diff --git a/python/mscclpp/_py_mscclpp.cpython-39-x86_64-linux-gnu.so b/python/mscclpp/_py_mscclpp.cpython-39-x86_64-linux-gnu.so deleted file mode 120000 index d29b044c..00000000 --- a/python/mscclpp/_py_mscclpp.cpython-39-x86_64-linux-gnu.so +++ /dev/null @@ -1 +0,0 @@ -../build/_py_mscclpp.cpython-39-x86_64-linux-gnu.so \ No newline at end of file diff --git a/python/mscclpp/test_mscclpp.py b/python/mscclpp/test_mscclpp.py deleted file mode 100644 index fec1229e..00000000 --- a/python/mscclpp/test_mscclpp.py +++ /dev/null @@ -1,49 +0,0 @@ -import unittest -import hamcrest - -import mscclpp - -class UniqueIdTest(unittest.TestCase): - def test_no_constructor(self) -> None: - hamcrest.assert_that( - hamcrest.calling(mscclpp.MscclppUniqueId).with_args(), - hamcrest.raises( - TypeError, - "no constructor", - ), - ) - - def test_getUniqueId(self) -> None: - myId = mscclpp.MscclppUniqueId.from_context() - - hamcrest.assert_that( - myId.bytes(), - hamcrest.has_length(mscclpp.MSCCLPP_UNIQUE_ID_BYTES), - ) - - # from_bytes should work - copy = mscclpp.MscclppUniqueId.from_bytes(myId.bytes()) - hamcrest.assert_that( - copy.bytes(), - hamcrest.equal_to(myId.bytes()), - ) - - # bad size - hamcrest.assert_that( - hamcrest.calling(mscclpp.MscclppUniqueId.from_bytes).with_args(b'abc'), - hamcrest.raises( - ValueError, - f"Requires exactly {mscclpp.MSCCLPP_UNIQUE_ID_BYTES} bytes; found 3" - ), - ) - - -class CommsTest(unittest.TestCase): - def 
_test(self) -> None: - # this hangs forever - comm = mscclpp.MscclppComm.init_rank_from_address( - address="127.0.0.1:50000", - rank=0, - world_size=2, - ) - comm.close() diff --git a/python/setup.sh b/python/setup.sh index a8eea6ed..fe080bbf 100755 --- a/python/setup.sh +++ b/python/setup.sh @@ -3,5 +3,4 @@ set -ex cmake -S . -B build cmake --build build --clean-first -v -ldd build/py_mscclpp.cpython-39-x86_64-linux-gnu.so diff --git a/python/test.sh b/python/test.sh index f675d9ab..32b19bda 100755 --- a/python/test.sh +++ b/python/test.sh @@ -8,4 +8,4 @@ fi cmake --build build -pytest mscclpp +pytest build/mscclpp From e181cca064c1efb0f44c24dc2c33f51f4190001c Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 20:31:43 +0000 Subject: [PATCH 05/12] switch to static linking of nanobind --- python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 3b2bdacc..f8798f66 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -28,7 +28,7 @@ set(MSCCLPP_DIR ${CMAKE_CURRENT_LIST_DIR}/../build) nanobind_add_module( _py_mscclpp NOSTRIP - NB_SHARED + NB_STATIC src/_py_mscclpp.cpp ) From 57b3c3697532ecba9c4c6ff94b68139e468098ec Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 21:24:00 +0000 Subject: [PATCH 06/12] include left out lib; add enums --- python/src/_py_mscclpp.cpp | 28 ++++++++ python/src/mscclpp/__init__.py | 18 +++++ python/src/mscclpp/test_mscclpp.py | 107 +++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+) create mode 100644 python/src/mscclpp/__init__.py create mode 100644 python/src/mscclpp/test_mscclpp.py diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index e485a8a8..32c75566 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -99,11 +99,39 @@ static const std::string DOC_MscclppUniqueId = static const std::string DOC_MscclppComm = "MSCCLPP Communications Handle"; + 
NB_MODULE(_py_mscclpp, m) { m.doc() = "Python bindings for MSCCLPP: which is not NCCL"; m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; + nb::enum_(m, "reduce_op") + .value("sum", mscclppRedOp_t::mscclppSum) + .value("prod", mscclppRedOp_t::mscclppProd) + .value("max", mscclppRedOp_t::mscclppMax) + .value("min", mscclppRedOp_t::mscclppMin) + .value("avg", mscclppRedOp_t::mscclppAvg); + + nb::enum_(m, "dtype") + .value("int8", mscclppDataType_t::mscclppInt8) + .value("char", mscclppDataType_t::mscclppChar) + .value("uint8", mscclppDataType_t::mscclppUint8) + .value("int32", mscclppDataType_t::mscclppInt32) + .value("uint32", mscclppDataType_t::mscclppUint32) + .value("int", mscclppDataType_t::mscclppInt) + .value("int64", mscclppDataType_t::mscclppInt64) + .value("uint64", mscclppDataType_t::mscclppUint64) + .value("float16", mscclppDataType_t::mscclppFloat16) + .value("half", mscclppDataType_t::mscclppHalf) + .value("float32", mscclppDataType_t::mscclppFloat32) + .value("float", mscclppDataType_t::mscclppFloat) + .value("float64", mscclppDataType_t::mscclppFloat64) + .value("double", mscclppDataType_t::mscclppDouble) +#if defined(__CUDA_BF16_TYPES_EXIST__) + .value("bfloat16", mscclppDataType_t::mscclppBfloat16) +#endif + ; + nb::class_(m, "MscclppUniqueId") .def_ro_static("__doc__", &DOC_MscclppUniqueId) .def_static( diff --git a/python/src/mscclpp/__init__.py b/python/src/mscclpp/__init__.py new file mode 100644 index 00000000..8778c097 --- /dev/null +++ b/python/src/mscclpp/__init__.py @@ -0,0 +1,18 @@ +from . 
import _py_mscclpp + +__all__ = ( + "MscclppUniqueId", + "MSCCLPP_UNIQUE_ID_BYTES", + "MscclppComm", + "dtype", + "reduce_op", +) + +dtype = _py_mscclpp.dtype +reduce_op = _py_mscclpp.reduce_op + +MscclppUniqueId = _py_mscclpp.MscclppUniqueId +MSCCLPP_UNIQUE_ID_BYTES = _py_mscclpp.MSCCLPP_UNIQUE_ID_BYTES + +MscclppComm = _py_mscclpp.MscclppComm + diff --git a/python/src/mscclpp/test_mscclpp.py b/python/src/mscclpp/test_mscclpp.py new file mode 100644 index 00000000..864ae85e --- /dev/null +++ b/python/src/mscclpp/test_mscclpp.py @@ -0,0 +1,107 @@ +import unittest +import hamcrest + +import mscclpp + +class DTypeTest(unittest.TestCase): + def test(self) -> None: + for name, val in [ + ('int8', 0), + ('char', 0), + ('uint8', 1), + ('int32', 2), + ('int', 2), + ('uint32', 3), + ('int64', 4), + ('uint64', 5), + ('float16', 6), + ('half', 6), + ('float32', 7), + ('float', 7), + ('float64', 8), + ('double', 8), + ]: + try: + dtype = getattr(mscclpp.dtype, name) + hamcrest.assert_that( + mscclpp.dtype(val), + hamcrest.equal_to(dtype), + reason=(name, val), + ) + hamcrest.assert_that( + int(mscclpp.dtype(val)), + hamcrest.equal_to(val), + reason=(name, val), + ) + except Exception as e: + raise AssertionError((name, val)) from e + +class ReduceOpTest(unittest.TestCase): + def test(self) -> None: + for name, val in [ + ('sum', 0), + ('prod', 1), + ('max', 2), + ('min', 3), + ('avg', 4), + ]: + try: + dtype = getattr(mscclpp.reduce_op, name) + hamcrest.assert_that( + mscclpp.reduce_op(val), + hamcrest.equal_to(dtype), + reason=(name, val), + ) + hamcrest.assert_that( + int(mscclpp.reduce_op(val)), + hamcrest.equal_to(val), + reason=(name, val), + ) + except Exception as e: + raise AssertionError((name, val)) from e + + +class UniqueIdTest(unittest.TestCase): + def test_no_constructor(self) -> None: + hamcrest.assert_that( + hamcrest.calling(mscclpp.MscclppUniqueId).with_args(), + hamcrest.raises( + TypeError, + "no constructor", + ), + ) + + def test_getUniqueId(self) -> 
None: + myId = mscclpp.MscclppUniqueId.from_context() + + hamcrest.assert_that( + myId.bytes(), + hamcrest.has_length(mscclpp.MSCCLPP_UNIQUE_ID_BYTES), + ) + + # from_bytes should work + copy = mscclpp.MscclppUniqueId.from_bytes(myId.bytes()) + hamcrest.assert_that( + copy.bytes(), + hamcrest.equal_to(myId.bytes()), + ) + + # bad size + hamcrest.assert_that( + hamcrest.calling(mscclpp.MscclppUniqueId.from_bytes).with_args(b'abc'), + hamcrest.raises( + ValueError, + f"Requires exactly {mscclpp.MSCCLPP_UNIQUE_ID_BYTES} bytes; found 3" + ), + ) + + +class CommsTest(unittest.TestCase): + def _test(self) -> None: + # this hangs forever + comm = mscclpp.MscclppComm.init_rank_from_address( + address="127.0.0.1:50000", + rank=0, + world_size=2, + ) + comm.close() From 8b6e35d5e049b3efbda9042be0f68210b7ee2f65 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 22:23:20 +0000 Subject: [PATCH 07/12] rebase and fix --- python/src/_py_mscclpp.cpp | 29 +-------------- python/src/mscclpp/__init__.py | 5 --- python/src/mscclpp/test_mscclpp.py | 57 ------------------------------ 3 files changed, 1 insertion(+), 90 deletions(-) diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index 32c75566..55c5848f 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -105,33 +105,6 @@ NB_MODULE(_py_mscclpp, m) { m.attr("MSCCLPP_UNIQUE_ID_BYTES") = MSCCLPP_UNIQUE_ID_BYTES; - nb::enum_(m, "reduce_op") - .value("sum", mscclppRedOp_t::mscclppSum) - .value("prod", mscclppRedOp_t::mscclppProd) - .value("max", mscclppRedOp_t::mscclppMax) - .value("min", mscclppRedOp_t::mscclppMin) - .value("avg", mscclppRedOp_t::mscclppAvg); - - nb::enum_(m, "dtype") - .value("int8", mscclppDataType_t::mscclppInt8) - .value("char", mscclppDataType_t::mscclppChar) - .value("uint8", mscclppDataType_t::mscclppUint8) - .value("int32", mscclppDataType_t::mscclppInt32) - .value("uint32", mscclppDataType_t::mscclppUint32) - .value("int", 
mscclppDataType_t::mscclppInt) - .value("int64", mscclppDataType_t::mscclppInt64) - .value("uint64", mscclppDataType_t::mscclppUint64) - .value("float16", mscclppDataType_t::mscclppFloat16) - .value("half", mscclppDataType_t::mscclppHalf) - .value("float32", mscclppDataType_t::mscclppFloat32) - .value("float", mscclppDataType_t::mscclppFloat) - .value("float64", mscclppDataType_t::mscclppFloat64) - .value("double", mscclppDataType_t::mscclppDouble) -#if defined(__CUDA_BF16_TYPES_EXIST__) - .value("bfloat16", mscclppDataType_t::mscclppBfloat16) -#endif - ; - nb::class_(m, "MscclppUniqueId") .def_ro_static("__doc__", &DOC_MscclppUniqueId) .def_static( @@ -172,7 +145,7 @@ NB_MODULE(_py_mscclpp, m) { comm._is_open = true; return maybe( mscclppCommInitRank( - &comm._handle, world_size, rank, address.c_str()), + &comm._handle, world_size, address.c_str(), rank), comm, "Failed to initialize comms: %s rank=%d world_size=%d", address, diff --git a/python/src/mscclpp/__init__.py b/python/src/mscclpp/__init__.py index 8778c097..e825b92d 100644 --- a/python/src/mscclpp/__init__.py +++ b/python/src/mscclpp/__init__.py @@ -4,13 +4,8 @@ __all__ = ( "MscclppUniqueId", "MSCCLPP_UNIQUE_ID_BYTES", "MscclppComm", - "dtype", - "reduce_op", ) -dtype = _py_mscclpp.dtype -reduce_op = _py_mscclpp.reduce_op - MscclppUniqueId = _py_mscclpp.MscclppUniqueId MSCCLPP_UNIQUE_ID_BYTES = _py_mscclpp.MSCCLPP_UNIQUE_ID_BYTES diff --git a/python/src/mscclpp/test_mscclpp.py b/python/src/mscclpp/test_mscclpp.py index 864ae85e..e67f2770 100644 --- a/python/src/mscclpp/test_mscclpp.py +++ b/python/src/mscclpp/test_mscclpp.py @@ -3,63 +3,6 @@ import hamcrest import mscclpp -class DTypeTest(unittest.TestCase): - def test(self) -> None: - for name, val in [ - ('int8', 0), - ('char', 0), - ('uint8', 1), - ('int32', 2), - ('int', 2), - ('uint32', 3), - ('int64', 4), - ('uint64', 5), - ('float16', 6), - ('half', 6), - ('float32', 7), - ('float', 7), - ('float64', 8), - ('double', 8), - ]: - try: - dtype = 
getattr(mscclpp.dtype, name) - hamcrest.assert_that( - mscclpp.dtype(val), - hamcrest.equal_to(dtype), - reason=(name, val), - ) - hamcrest.assert_that( - int(mscclpp.dtype(val)), - hamcrest.equal_to(val), - reason=(name, val), - ) - except Exception as e: - raise AssertionError((name, val)) from e - -class ReduceOpTest(unittest.TestCase): - def test(self) -> None: - for name, val in [ - ('sum', 0), - ('prod', 1), - ('max', 2), - ('min', 3), - ('avg', 4), - ]: - try: - dtype = getattr(mscclpp.reduce_op, name) - hamcrest.assert_that( - mscclpp.reduce_op(val), - hamcrest.equal_to(dtype), - reason=(name, val), - ) - hamcrest.assert_that( - int(mscclpp.reduce_op(val)), - hamcrest.equal_to(val), - reason=(name, val), - ) - except Exception as e: - raise AssertionError((name, val)) from e - class UniqueIdTest(unittest.TestCase): def test_no_constructor(self) -> None: From 3b1abaaad1047308d1eb328b5c6626552fb0ec08 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Fri, 24 Mar 2023 23:45:29 +0000 Subject: [PATCH 08/12] basic init test --- python/src/_py_mscclpp.cpp | 11 ++++++++ python/src/mscclpp/test_mscclpp.py | 45 ++++++++++++++++++++++++------ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/python/src/_py_mscclpp.cpp b/python/src/_py_mscclpp.cpp index 55c5848f..095ff2cf 100644 --- a/python/src/_py_mscclpp.cpp +++ b/python/src/_py_mscclpp.cpp @@ -247,5 +247,16 @@ NB_MODULE(_py_mscclpp, m) { .def( "__del__", &MscclppComm::close, + nb::call_guard()) + .def( + "bootstrap_all_gather", + [](MscclppComm &comm, void *data, int size) { + comm.check_open(); + return maybe( + mscclppBootstrapAllGather(comm._handle, data, size), + true, + "Failed to stop MSCCLPP proxy"); + }, nb::call_guard()); + } diff --git a/python/src/mscclpp/test_mscclpp.py b/python/src/mscclpp/test_mscclpp.py index e67f2770..a77707d8 100644 --- a/python/src/mscclpp/test_mscclpp.py +++ b/python/src/mscclpp/test_mscclpp.py @@ -1,3 +1,4 @@ +import concurrent.futures import unittest import 
hamcrest @@ -38,13 +39,41 @@ class UniqueIdTest(unittest.TestCase): ), ) +def all_gather_task(rank: int, world_size: int) -> None: + comm_options = dict( + address="127.0.0.1:50000", + rank=rank, + world_size=world_size, + ) + print(f'{comm_options=}', flush=True) + + comm = mscclpp.MscclppComm.init_rank_from_address(**comm_options) + + buf = bytearray(world_size) + buf[rank] = rank + + if False: + # crashes, bad call structure.. + comm.bootstrap_all_gather(memoryview(buf), world_size) + hamcrest.assert_that( + buf, + hamcrest.equal_to(b'\000\002'), + ) + + comm.close() + class CommsTest(unittest.TestCase): - def _test(self) -> None: - # this hangs forever - comm = mscclpp.MscclppComm.init_rank_from_address( - address="127.0.0.1:50000", - rank=0, - world_size=2, - ) - comm.close() + def test_all_gather(self) -> None: + world_size = 2 + + tasks: list[concurrent.futures.Future[None]] = [] + + with concurrent.futures.ProcessPoolExecutor(max_workers=world_size) as pool: + for rank in range(world_size): + tasks.append(pool.submit(all_gather_task, rank, world_size)) + + for f in concurrent.futures.as_completed(tasks): + f.result() + + From f929d2eabab5e0357ca2a581a700bed79f498e63 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Sat, 25 Mar 2023 00:41:21 +0000 Subject: [PATCH 09/12] add ci hook --- python/.gitignore | 2 +- python/ci.sh | 24 ++++++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 python/ci.sh diff --git a/python/.gitignore b/python/.gitignore index 19bd21ec..1a1abdb2 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,2 +1,2 @@ .*.swp - +.venv/ diff --git a/python/ci.sh b/python/ci.sh new file mode 100755 index 00000000..f3ffb86a --- /dev/null +++ b/python/ci.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# CI hook script. + +set -ex + +# CD to this directory. 
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + +# clean env +rm -rf .venv build + +# setup a python virtual env +python -m venv .venv + +# activate the virtual env +source .venv/bin/activate + +# install venv deps. +pip install -r requirements.txt + +# run the build and test. +./test.sh + From 95fda5a4ef0725c2f979d824d2c581e9967269c7 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Sat, 25 Mar 2023 00:15:56 -0700 Subject: [PATCH 10/12] ignore ide dirs --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1739c837..5d5eff88 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ build/ __pycache__ .*.swp +.idea/ From 98e254c5f6c75a773190eb7705e82243b0ef1dd1 Mon Sep 17 00:00:00 2001 From: Crutcher Dunnavant Date: Sat, 25 Mar 2023 01:06:23 -0700 Subject: [PATCH 11/12] readme change --- python/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index a00b200a..73e0f17e 100644 --- a/python/README.md +++ b/python/README.md @@ -36,12 +36,17 @@ uname -r # install +# break /usr/sbin/policy-rc.d so we can install modules +echo '#!/bin/sh +exit 0' > /usr/sbin/policy-rc.d + apt update apt install -y \ build-essential devscripts debhelper check \ libsubunit-dev fakeroot pkg-config dkms \ - nvidia-dkms-525-server \ linux-headers-5.4.0-1090-azure + +apt install -y nvidia-dkms-525-server cd $WORKDIR/gdrcopy From 9eca65283c9626dbedfdbbaca7cc5088181be0db Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 25 Mar 2023 18:44:20 +0000 Subject: [PATCH 12/12] added cmake requirement -- it needs 3.18 or higher --- python/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/python/README.md b/python/README.md index 73e0f17e..04e90000 100644 --- a/python/README.md +++ b/python/README.md @@ -2,6 +2,7 @@ Test instructions: * Compile the `libmscclpp.so` library. 
+ * Install `cmake` version >= 3.18 * setup a python virtual env * `pip install -r requirements.txt` * `./tesh.sh`