From 4114d65c601004d13058c8daa7c654c00ab74cd2 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 7 Jul 2023 17:35:05 +0800 Subject: [PATCH] Documents & minor updates (#119) Co-authored-by: Saeed Maleki Co-authored-by: Binyang Li --- .github/workflows/codeql.yml | 15 +- CMakeLists.txt | 72 +-- README.md | 30 +- cmake/AddClangFormatTargets.cmake | 17 +- docs/Doxyfile | 7 +- docs/quickstart.md | 78 ++- include/mscclpp/concurrency.hpp | 15 +- include/mscclpp/core.hpp | 492 ++++++++++++++---- include/mscclpp/cuda_utils.hpp | 88 +++- include/mscclpp/errors.hpp | 41 +- include/mscclpp/fifo.hpp | 62 ++- include/mscclpp/packet.hpp | 32 +- include/mscclpp/proxy_channel.hpp | 146 +++++- include/mscclpp/semaphore.hpp | 65 ++- include/mscclpp/sm_channel.hpp | 81 ++- src/CMakeLists.txt | 1 + src/connection.cc | 4 +- src/core.cc | 100 ++++ src/proxy.cc | 1 + src/proxy_channel.cc | 4 +- test/CMakeLists.txt | 35 +- test/allgather_test_cpp.cu | 68 +-- test/mp_unit/CMakeLists.txt | 2 +- test/mp_unit/communicator_tests.cu | 8 +- test/mp_unit/ib_tests.cu | 4 +- test/mp_unit/mp_unit_tests.hpp | 6 +- ...hannel_tests.cu => proxy_channel_tests.cu} | 118 ++--- test/mscclpp-test/CMakeLists.txt | 3 +- test/mscclpp-test/allgather_test.cu | 102 ++-- test/mscclpp-test/allreduce_test.cu | 76 +-- test/mscclpp-test/alltoall_test.cu | 32 +- test/mscclpp-test/common.cc | 16 +- test/mscclpp-test/common.hpp | 4 +- 33 files changed, 1314 insertions(+), 511 deletions(-) create mode 100644 src/core.cc rename test/mp_unit/{device_channel_tests.cu => proxy_channel_tests.cu} (66%) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 9841234b..9f46b386 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -13,15 +13,11 @@ jobs: runs-on: 'ubuntu-latest' container: image: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} permissions: actions: read contents: read 
security-events: write - packages: read strategy: fail-fast: false @@ -41,13 +37,14 @@ jobs: with: languages: ${{ matrix.language }} - - run: | - echo "Run, Build Application using script" - git config --global --add safe.directory /__w/mscclpp/mscclpp + - name: Install cmake + run: | curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release . - make -j + sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake + + - name: Autobuild + uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 diff --git a/CMakeLists.txt b/CMakeLists.txt index fbf7b708..1152fe64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,75 +1,75 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +set(MSCCLPP_MAJOR "0") +set(MSCCLPP_MINOR "1") +set(MSCCLPP_PATCH "0") + +set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) +set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") + cmake_minimum_required(VERSION 3.26) project(mscclpp LANGUAGES CUDA CXX) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra") - -option(ENABLE_TRACE "Enable tracing" OFF) -option(USE_MPI_FOR_TESTS "Use MPI for tests" ON) -option(USE_NPKIT "Use NPKIT" ON) -option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) - -if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - set (CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "default install path" FORCE) -endif() +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) -find_package(CUDAToolkit REQUIRED) +# clang-format targets +include(${PROJECT_SOURCE_DIR}/cmake/AddClangFormatTargets.cmake) -# Set CUDA flags based 
on the detected CUDA version +# Options +option(ENABLE_TRACE "Enable tracing" OFF) +option(USE_NPKIT "Use NPKIT" ON) +option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF) + +# Find CUDAToolkit. Set CUDA flags based on the detected CUDA version +find_package(CUDAToolkit REQUIRED) if(CUDAToolkit_FOUND) if(CUDAToolkit_VERSION_MAJOR LESS 11) message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}") endif() if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_80,code=sm_80") + set(CMAKE_CUDA_ARCHITECTURES 80) endif() if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_90,code=sm_90") + set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) endif() endif() +set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver) +# Find ibverbs and libnuma find_package(IBVerbs REQUIRED) find_package(NUMA REQUIRED) -if(USE_MPI_FOR_TESTS) - find_package(MPI REQUIRED) -endif() + +# Find optional packages if(ALLOW_GDRCOPY) find_package(GDRCopy) endif() -include_directories(${CUDAToolkit_INCLUDE_DIRS}) -include(CTest) -include(FetchContent) -FetchContent_Declare(googletest URL https://github.com/google/googletest/archive/b796f7d44681514f58a683a3a71ff17c94edb0c1.zip) -option(INSTALL_GTEST OFF) -FetchContent_MakeAvailable(googletest) -include(GoogleTest) - -set(CLANG_FORMAT_SOURCE_DIRS include src test) -include(${PROJECT_SOURCE_DIR}/cmake/AddClangFormatTargets.cmake) - +# libmscclpp add_library(mscclpp SHARED) -target_include_directories(mscclpp PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src/include) -set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX) -target_link_libraries(mscclpp PRIVATE MSCCLPP::ibverbs MSCCLPP::numa CUDA::cudart CUDA::cuda_driver) +target_include_directories(mscclpp + PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ${IBVERBS_INCLUDE_DIRS} + ${NUMA_INCLUDE_DIRS} + 
${GDRCOPY_INCLUDE_DIRS}) +target_link_libraries(mscclpp PRIVATE ${CUDA_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} ${GDRCOPY_LIBRARIES}) +set_target_properties(mscclpp PROPERTIES LINKER_LANGUAGE CXX VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION}) if(ENABLE_TRACE) target_compile_definitions(mscclpp PRIVATE ENABLE_TRACE) endif() if(USE_NPKIT) target_compile_definitions(mscclpp PRIVATE ENABLE_NPKIT) endif() -if(ALLOW_GDRCOPY AND GDRCOPY_FOUND) - target_compile_definitions(mscclpp PRIVATE MSCCLPP_USE_GDRCOPY) - target_link_libraries(mscclpp PRIVATE MSCCLPP::gdrcopy) -endif() +add_subdirectory(include) +add_subdirectory(src) +install(TARGETS mscclpp LIBRARY DESTINATION lib) -add_subdirectory(include) # This adds the public headers to install with mscclpp -add_subdirectory(src) # This adds the sources to the mscclpp target +# Tests add_subdirectory(test) diff --git a/README.md b/README.md index db7e4619..d18c4f76 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ See [Quick Start](docs/quickstart.md) to quickly get started. See the latest performance evaluation on Azure [NDmv4](docs/performance-ndmv4.md). -Build our Doxygen document by running `doxygen` in [`docs/`](docs/) directory. +Build our Doxygen document by running `doxygen` in [`docs/`](docs/) directory. Run `python3 -m http.server <PORT>` in `docs/doxygen/html/` directory to serve the generated HTML files. ## Overview @@ -97,22 +97,22 @@ __global__ void gpuKernel() { ... 
} -// Host-side custom channel service -class CustomChannelService { +// Host-side custom proxy service +class CustomProxyService { private: mscclpp::Proxy proxy_; public: - CustomChannelService() : proxy_([&](mscclpp::ProxyTrigger trigger) { - // Custom trigger handler - if (trigger.fst == 1) { - // Handle request "1" - } else if (trigger.fst == 2) { - // Handle request "2" - } else if (trigger.fst == 0xdeadbeef) { - // Handle request "0xdeadbeef" - } - }, - [&]() { /* Empty proxy initializer */ }) {} + CustomProxyService() : proxy_([&](mscclpp::ProxyTrigger trigger) { + // Custom trigger handler + if (trigger.fst == 1) { + // Handle request "1" + } else if (trigger.fst == 2) { + // Handle request "2" + } else if (trigger.fst == 0xdeadbeef) { + // Handle request "0xdeadbeef" + } + }, + [&]() { /* Empty proxy initializer */ }) {} void startProxy() { proxy_.start(); } void stopProxy() { proxy_.stop(); } }; @@ -139,7 +139,7 @@ MSCCL++ is under active development and a part of its features will be added in ### MSCCL++ v0.2 (Latest Release) * Basic communication functionalities and new interfaces - GPU-side communication interfaces - - Host-side helpers: bootstrap, communicator, and channel service (proxy) + - Host-side helpers: bootstrap, communicator, and proxy - Supports both NVLink and InfiniBand - Supports both in-SM copy and DMA/RDMA * Communication performance optimization diff --git a/cmake/AddClangFormatTargets.cmake b/cmake/AddClangFormatTargets.cmake index 73511b97..ff56fc04 100644 --- a/cmake/AddClangFormatTargets.cmake +++ b/cmake/AddClangFormatTargets.cmake @@ -6,16 +6,13 @@ find_program(CLANG_FORMAT clang-format) if(CLANG_FORMAT) message(STATUS "Found clang-format: ${CLANG_FORMAT}") - set(CLANG_FORMAT_FILE_TYPES *.h *.hpp *.c *.cc *.cpp *.cu) - # Produce combinations of source directories and file types - foreach(SOURCE_DIR ${CLANG_FORMAT_SOURCE_DIRS}) - foreach(FILE_TYPE ${CLANG_FORMAT_FILE_TYPES}) - list(APPEND CLANG_FORMAT_SOURCE_PATTERNS 
${SOURCE_DIR}/${FILE_TYPE}) - endforeach() - endforeach() - file(GLOB_RECURSE CLANG_FORMAT_SOURCES ${CLANG_FORMAT_SOURCE_PATTERNS}) - add_custom_target(check-format ALL COMMAND ${CLANG_FORMAT} -style=file --dry-run ${CLANG_FORMAT_SOURCES}) - add_custom_target(format COMMAND ${CLANG_FORMAT} -style=file -i ${CLANG_FORMAT_SOURCES}) + set(FIND_DIRS ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/test) + add_custom_target(check-format ALL + COMMAND ${CLANG_FORMAT} -style=file --dry-run `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` + ) + add_custom_target(format + COMMAND ${CLANG_FORMAT} -style=file -i `find ${FIND_DIRS} -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` + ) else() message(STATUS "clang-format not found.") endif() diff --git a/docs/Doxyfile b/docs/Doxyfile index b885a01e..0fa68bf2 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -829,7 +829,10 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../include/mscclpp +INPUT = ../include/mscclpp \ + ../README.md \ + quickstart.md \ + performance-ndmv4.md # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses @@ -1027,7 +1030,7 @@ FILTER_SOURCE_PATTERNS = # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. 
-USE_MDFILE_AS_MAINPAGE = +USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing diff --git a/docs/quickstart.md b/docs/quickstart.md index d93cd5e5..37d5ed77 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -15,24 +15,84 @@ ## Build from Source -``` +CMake 3.26 or later is required. + +```bash $ git clone https://github.com/microsoft/mscclpp.git $ mkdir -p mscclpp/build && cd mscclpp/build -$ cmake .. +$ cmake -DCMAKE_BUILD_TYPE=Release .. $ make -j ``` ## Install from Source -``` -# Install the generated headers and binaries to /usr/local -$ cmake --install . --prefix /usr/local +```bash +# Install the generated headers and binaries to /usr/local/mscclpp +$ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local/mscclpp .. +$ make -j mscclpp +$ sudo make install/fast ``` -## Install from Package +## Docker Images -TBU +Our base image installs all prerequisites for MSCCL++. -## (Optional) Unit Tests +```bash +$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +``` -TBU +## Unit Tests + +`unit_tests` require one GPU on the system. It only tests operation of basic components. + +```bash +$ make -j unit_tests +$ ./test/unit_tests +``` + +For thorough testing of MSCCL++ features, we need to use `mp_unit_tests` that require at least two GPUs on the system. `mp_unit_tests` also requires MPI to be installed on the system. For example, the following commands run `mp_unit_tests` with two processes (two GPUs). The number of GPUs can be changed by changing the number of processes. + +```bash +$ make -j mp_unit_tests +$ mpirun -np 2 ./test/mp_unit_tests +``` + +To run `mp_unit_tests` with more than two nodes, you need to specify the `-ip_port` argument that is accessible from all nodes. 
For example: + +```bash +$ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.0.0.5:50000 +``` + +## mscclpp-test + +mscclpp-test is a set of performance benchmarks for MSCCL++. It requires MPI to be installed on the system. + +```bash +$ make -j sendrecv_test_perf allgather_test_perf allreduce_test_perf alltoall_test_perf +``` + +For example, the following command runs the AllReduce benchmark with 8 GPUs starting from 3MB to 48MB messages, by doubling the message size in between. + +```bash +$ mpirun -np 8 ./test/mscclpp-test/allreduce_test_perf -b 3m -e 48m -G 100 -n 100 -w 20 -f 2 -k 4 +``` + +Check the help message for more details. + +```bash +$ ./test/mscclpp-test/allreduce_test_perf --help +USAGE: allreduce_test_perf + [-b,--minbytes ] + [-e,--maxbytes ] + [-i,--stepbytes ] + [-f,--stepfactor ] + [-n,--iters ] + [-w,--warmup_iters ] + [-c,--check <0/1>] + [-T,--timeout