mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-04-20 06:49:29 +00:00
v0.3.0 (#171)
This commit is contained in:
10
.github/workflows/codeql.yml
vendored
10
.github/workflows/codeql.yml
vendored
@@ -12,7 +12,7 @@ jobs:
|
||||
name: Analyze
|
||||
runs-on: 'ubuntu-latest'
|
||||
container:
|
||||
image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
|
||||
image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
|
||||
|
||||
permissions:
|
||||
actions: read
|
||||
@@ -27,7 +27,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check disk space
|
||||
run: |
|
||||
@@ -38,12 +38,6 @@ jobs:
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
|
||||
- name: Install cmake
|
||||
run: |
|
||||
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
|
||||
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
|
||||
sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
|
||||
|
||||
- name: Dubious ownership exception
|
||||
run: |
|
||||
git config --global --add safe.directory /__w/mscclpp/mscclpp
|
||||
|
||||
15
.github/workflows/integration-test-backup.yml
vendored
15
.github/workflows/integration-test-backup.yml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
IntegrationTest:
|
||||
runs-on: self-hosted
|
||||
runs-on: [ self-hosted, A100 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -13,22 +13,17 @@ jobs:
|
||||
cuda: [ cuda11.8, cuda12.1 ]
|
||||
|
||||
container:
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install CMake
|
||||
run: |
|
||||
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
|
||||
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
|
||||
- name: Lock GPU clock frequency
|
||||
@@ -41,7 +36,6 @@ jobs:
|
||||
- name: Run mscclpp AllGather test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
@@ -50,13 +44,11 @@ jobs:
|
||||
- name: Run mscclpp SendRecv test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
|
||||
- name: Run mscclpp AllReduce test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
|
||||
@@ -68,7 +60,6 @@ jobs:
|
||||
- name: Run mscclpp AllToAll test
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
|
||||
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
|
||||
|
||||
|
||||
12
.github/workflows/lint.yml
vendored
12
.github/workflows/lint.yml
vendored
@@ -11,7 +11,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Check out Git repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install ClangFormat
|
||||
run: |
|
||||
@@ -28,25 +28,25 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Check out Git repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
python-version: 3
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: python3.8 -m pip install black
|
||||
run: python3 -m pip install black
|
||||
|
||||
- name: Run black
|
||||
run: python3.8 -m black --check --config pyproject.toml .
|
||||
run: python3 -m black --check --config pyproject.toml .
|
||||
|
||||
spelling:
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
steps:
|
||||
- name: Check out Git repository
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Download misspell
|
||||
run: |
|
||||
|
||||
19
.github/workflows/ut-backup.yml
vendored
19
.github/workflows/ut-backup.yml
vendored
@@ -4,7 +4,7 @@ on: workflow_dispatch
|
||||
|
||||
jobs:
|
||||
UnitTest:
|
||||
runs-on: self-hosted
|
||||
runs-on: [ self-hosted, A100 ]
|
||||
defaults:
|
||||
run:
|
||||
shell: bash
|
||||
@@ -14,7 +14,7 @@ jobs:
|
||||
cuda: [ cuda11.8, cuda12.1 ]
|
||||
|
||||
container:
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
|
||||
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
|
||||
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
|
||||
|
||||
steps:
|
||||
@@ -23,10 +23,8 @@ jobs:
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
|
||||
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
|
||||
mkdir build && cd build
|
||||
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
@@ -36,31 +34,20 @@ jobs:
|
||||
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
|
||||
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
|
||||
done
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: UnitTests
|
||||
run: |
|
||||
./build/test/unit_tests
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: MpUnitTests
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
|
||||
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
- name: PyTests
|
||||
run: |
|
||||
set -e
|
||||
export PATH=/usr/local/mpi/bin:$PATH
|
||||
cd build && make pylib-copy
|
||||
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
|
||||
python3 -m pip install -r ../python/test/requirements_cu11.txt
|
||||
else
|
||||
python3 -m pip install -r ../python/test/requirements_cu12.txt
|
||||
fi
|
||||
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
|
||||
working-directory: ${{ github.workspace }}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# Licensed under the MIT license.
|
||||
|
||||
set(MSCCLPP_MAJOR "0")
|
||||
set(MSCCLPP_MINOR "2")
|
||||
set(MSCCLPP_MINOR "3")
|
||||
set(MSCCLPP_PATCH "0")
|
||||
|
||||
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
|
||||
|
||||
36
README.md
36
README.md
@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
|
||||
|
||||
* **Runtime Performance Optimization for Dynamic Workload.** Because flexible communication logic is easy to implement, communication performance can be optimized even at runtime. For example, a system can automatically choose between different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
|
||||
|
||||
## Key Features (v0.2)
|
||||
## Key Features (v0.3)
|
||||
|
||||
MSCCL++ v0.2 supports the following features.
|
||||
MSCCL++ v0.3 supports the following features.
|
||||
|
||||
### In-Kernel Communication Interfaces
|
||||
|
||||
@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
|
||||
|
||||
Most of the key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to readily adopt new software and hardware technologies and lets users implement algorithms optimized for their own use cases.
|
||||
|
||||
## Status & Roadmap
|
||||
### New in MSCCL++ v0.3 (Latest Release)
|
||||
* Updated interfaces
|
||||
* Add Python bindings and interfaces
|
||||
* Add Python unit tests
|
||||
* Add more configurable parameters
|
||||
* Add a new single-node AllReduce kernel
|
||||
* Fix bugs
|
||||
|
||||
MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
|
||||
|
||||
### MSCCL++ v0.4 (TBU)
|
||||
* Automatic task scheduler
|
||||
* Dynamic performance tuning
|
||||
|
||||
### MSCCL++ v0.3 (TBU)
|
||||
* Tile-based communication: efficient transport of 2D data patches (tiles)
|
||||
* GPU computation interfaces
|
||||
|
||||
### MSCCL++ v0.2 (Latest Release)
|
||||
* Basic communication functionalities and new interfaces
|
||||
- GPU-side communication interfaces
|
||||
- Host-side helpers: bootstrap, communicator, and proxy
|
||||
- Supports both NVLink and InfiniBand
|
||||
- Supports both in-SM copy and DMA/RDMA
|
||||
* Communication performance optimization
|
||||
- Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
|
||||
* Development pipeline
|
||||
* Documentation
|
||||
|
||||
### MSCCL++ v0.1
|
||||
* Proof-of-concept, preliminary interfaces
|
||||
See https://github.com/microsoft/mscclpp/issues/89 for details.
|
||||
|
||||
## Contributing
|
||||
|
||||
|
||||
@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update && \
|
||||
RUN rm -rf /opt/nvidia
|
||||
|
||||
RUN apt-get clean && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
ca-certificates \
|
||||
@@ -47,8 +50,10 @@ RUN cd /tmp && \
|
||||
cd .. && \
|
||||
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
|
||||
|
||||
ENV PATH="${PATH}:/usr/local/mpi/bin" \
|
||||
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
|
||||
ENV PATH="/usr/local/mpi/bin:${PATH}" \
|
||||
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
|
||||
|
||||
RUN echo PATH="${PATH}" > /etc/environment && \
|
||||
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
|
||||
|
||||
ENTRYPOINT []
|
||||
|
||||
28
docker/dev-cuda11.8.dockerfile
Normal file
28
docker/dev-cuda11.8.dockerfile
Normal file
@@ -0,0 +1,28 @@
|
||||
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
|
||||
|
||||
LABEL maintainer="MSCCL++"
|
||||
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
|
||||
|
||||
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
|
||||
CMAKE_VERSION="3.26.4"
|
||||
|
||||
ADD . ${MSCCLPP_SRC_DIR}
|
||||
WORKDIR ${MSCCLPP_SRC_DIR}
|
||||
|
||||
# Install cmake 3.26.4
|
||||
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
|
||||
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
|
||||
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
|
||||
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
|
||||
rm -rf ${CMAKE_HOME}.tar.gz
|
||||
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
|
||||
|
||||
# Install pytest & dependencies
|
||||
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
|
||||
|
||||
# Set PATH
|
||||
RUN echo PATH="${PATH}" > /etc/environment
|
||||
|
||||
# Cleanup
|
||||
WORKDIR /
|
||||
RUN rm -rf ${MSCCLPP_SRC_DIR}
|
||||
27
docker/dev-cuda12.1.dockerfile
Normal file
27
docker/dev-cuda12.1.dockerfile
Normal file
@@ -0,0 +1,27 @@
|
||||
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
|
||||
|
||||
LABEL maintainer="MSCCL++"
|
||||
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
|
||||
|
||||
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
|
||||
CMAKE_VERSION="3.26.4"
|
||||
|
||||
ADD . ${MSCCLPP_SRC_DIR}
|
||||
WORKDIR ${MSCCLPP_SRC_DIR}
|
||||
|
||||
# Install cmake 3.26.4
|
||||
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
|
||||
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
|
||||
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
|
||||
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local
|
||||
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
|
||||
|
||||
# Install pytest & dependencies
|
||||
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
|
||||
|
||||
# Set PATH
|
||||
RUN echo PATH="${PATH}" > /etc/environment
|
||||
|
||||
# Cleanup
|
||||
WORKDIR /
|
||||
RUN rm -rf ${MSCCLPP_SRC_DIR}
|
||||
@@ -5,7 +5,7 @@
|
||||
#define MSCCLPP_CORE_HPP_
|
||||
|
||||
#define MSCCLPP_MAJOR 0
|
||||
#define MSCCLPP_MINOR 2
|
||||
#define MSCCLPP_MINOR 3
|
||||
#define MSCCLPP_PATCH 0
|
||||
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
|
||||
|
||||
@@ -24,6 +24,9 @@ namespace mscclpp {
|
||||
/// Unique ID for a process. This is an MSCCLPP_UNIQUE_ID_BYTES-byte array that uniquely identifies a process.
|
||||
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
|
||||
|
||||
/// Return a version string.
|
||||
std::string version();
|
||||
|
||||
/// Base class for bootstraps.
|
||||
class Bootstrap {
|
||||
public:
|
||||
|
||||
@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
|
||||
|
||||
[project]
|
||||
name = "mscclpp"
|
||||
version = "0.2.0"
|
||||
version = "0.3.0"
|
||||
|
||||
[tool.scikit-build]
|
||||
cmake.minimum-version = "3.25.0"
|
||||
|
||||
@@ -7,11 +7,10 @@ add_subdirectory(test)
|
||||
add_custom_target(pylib-copy)
|
||||
add_custom_command(TARGET pylib-copy POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
|
||||
)
|
||||
|
||||
|
||||
@@ -18,8 +18,11 @@ from ._mscclpp import (
|
||||
TcpBootstrap,
|
||||
Transport,
|
||||
TransportFlags,
|
||||
version,
|
||||
)
|
||||
|
||||
__version__ = version()
|
||||
|
||||
|
||||
def get_include():
|
||||
"""Return the directory that contains the MSCCL++ headers."""
|
||||
|
||||
@@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
|
||||
}
|
||||
|
||||
void register_core(nb::module_& m) {
|
||||
m.def("version", &version);
|
||||
|
||||
nb::class_<Bootstrap>(m, "Bootstrap")
|
||||
.def("get_rank", &Bootstrap::getRank)
|
||||
.def("get_n_ranks", &Bootstrap::getNranks)
|
||||
|
||||
@@ -127,10 +127,7 @@ class MscclppGroup:
|
||||
channels = {}
|
||||
for rank in connections:
|
||||
channels[rank] = SmChannel(
|
||||
semaphores[rank],
|
||||
registered_memories[rank],
|
||||
tensor.data.ptr,
|
||||
packetTensor.data.ptr,
|
||||
semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr
|
||||
)
|
||||
return channels
|
||||
|
||||
@@ -148,8 +145,6 @@ class MscclppGroup:
|
||||
channels = {}
|
||||
for rank in semaphores:
|
||||
channels[rank] = SimpleProxyChannel(
|
||||
proxy_service.proxy_channel(semaphore_ids[rank]),
|
||||
memory_ids[rank],
|
||||
memory_ids[self.my_rank],
|
||||
proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank]
|
||||
)
|
||||
return channels
|
||||
|
||||
@@ -9,14 +9,7 @@ import numpy as np
|
||||
import netifaces as ni
|
||||
import pytest
|
||||
|
||||
from mscclpp import (
|
||||
Fifo,
|
||||
Host2DeviceSemaphore,
|
||||
Host2HostSemaphore,
|
||||
ProxyService,
|
||||
SmDevice2DeviceSemaphore,
|
||||
Transport,
|
||||
)
|
||||
from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport
|
||||
from ._cpp import _ext
|
||||
from .mscclpp_group import MscclppGroup
|
||||
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
|
||||
@@ -61,11 +54,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
|
||||
for rank in range(group.nranks):
|
||||
if rank == group.my_rank:
|
||||
continue
|
||||
group.send(
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))],
|
||||
rank,
|
||||
0,
|
||||
)
|
||||
group.send(memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], rank, 0)
|
||||
for rank in range(group.nranks):
|
||||
if rank == group.my_rank:
|
||||
continue
|
||||
@@ -207,43 +196,31 @@ class MscclppKernel:
|
||||
):
|
||||
if test_name == "h2d_semaphore":
|
||||
self._kernel = KernelBuilder(
|
||||
file="h2d_semaphore_test.cu",
|
||||
kernel_name="h2d_semaphore",
|
||||
file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore"
|
||||
).get_compiled_kernel()
|
||||
self.nblocks = 1
|
||||
self.nthreads = nranks
|
||||
elif test_name == "d2d_semaphore":
|
||||
self._kernel = KernelBuilder(
|
||||
file="d2d_semaphore_test.cu",
|
||||
kernel_name="d2d_semaphore",
|
||||
file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore"
|
||||
).get_compiled_kernel()
|
||||
self.nblocks = 1
|
||||
self.nthreads = nranks
|
||||
elif test_name == "sm_channel":
|
||||
self._kernel = KernelBuilder(
|
||||
file="sm_channel_test.cu",
|
||||
kernel_name="sm_channel",
|
||||
).get_compiled_kernel()
|
||||
self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel()
|
||||
self.nblocks = nranks
|
||||
self.nthreads = 1024
|
||||
elif test_name == "fifo":
|
||||
self._kernel = KernelBuilder(
|
||||
file="fifo_test.cu",
|
||||
kernel_name="fifo",
|
||||
).get_compiled_kernel()
|
||||
self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel()
|
||||
self.nblocks = 1
|
||||
self.nthreads = 1
|
||||
elif test_name == "proxy":
|
||||
self._kernel = KernelBuilder(
|
||||
file="proxy_test.cu",
|
||||
kernel_name="proxy",
|
||||
).get_compiled_kernel()
|
||||
self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel()
|
||||
self.nblocks = 1
|
||||
self.nthreads = nranks
|
||||
elif test_name == "simple_proxy_channel":
|
||||
self._kernel = KernelBuilder(
|
||||
file="simple_proxy_channel_test.cu",
|
||||
kernel_name="simple_proxy_channel",
|
||||
file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel"
|
||||
).get_compiled_kernel()
|
||||
self.nblocks = 1
|
||||
self.nthreads = 1024
|
||||
@@ -364,17 +341,10 @@ def test_fifo(
|
||||
@parametrize_mpi_groups(2, 4, 8, 16)
|
||||
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
|
||||
@pytest.mark.parametrize("transport", ["IB", "NVLink"])
|
||||
def test_proxy(
|
||||
mpi_group: MpiGroup,
|
||||
nelem: int,
|
||||
transport: str,
|
||||
):
|
||||
def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
|
||||
group, connections = create_and_connect(mpi_group, transport)
|
||||
|
||||
memory = cp.zeros(
|
||||
nelem,
|
||||
dtype=cp.int32,
|
||||
)
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
nelemPerRank = nelem // group.nranks
|
||||
nelemPerRank * memory.itemsize
|
||||
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
|
||||
@@ -401,23 +371,12 @@ def test_proxy(
|
||||
|
||||
list_reg_mem.append(all_reg_memories[rank])
|
||||
|
||||
proxy = _ext.MyProxyService(
|
||||
group.my_rank,
|
||||
group.nranks,
|
||||
nelem * memory.itemsize,
|
||||
list_conn,
|
||||
list_reg_mem,
|
||||
list_sem,
|
||||
)
|
||||
proxy = _ext.MyProxyService(group.my_rank, group.nranks, nelem * memory.itemsize, list_conn, list_reg_mem, list_sem)
|
||||
|
||||
fifo_device_handle = proxy.fifo_device_handle()
|
||||
|
||||
kernel = MscclppKernel(
|
||||
"proxy",
|
||||
my_rank=group.my_rank,
|
||||
nranks=group.nranks,
|
||||
semaphore_or_channels=list_sem,
|
||||
fifo=fifo_device_handle,
|
||||
"proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle
|
||||
)
|
||||
proxy.start()
|
||||
group.barrier()
|
||||
@@ -432,12 +391,7 @@ def test_proxy(
|
||||
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
|
||||
@pytest.mark.parametrize("transport", ["NVLink", "IB"])
|
||||
@pytest.mark.parametrize("use_packet", [False, True])
|
||||
def test_simple_proxy_channel(
|
||||
mpi_group: MpiGroup,
|
||||
nelem: int,
|
||||
transport: str,
|
||||
use_packet: bool,
|
||||
):
|
||||
def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
|
||||
group, connections = create_and_connect(mpi_group, transport)
|
||||
|
||||
memory = cp.zeros(nelem, dtype=cp.int32)
|
||||
|
||||
@@ -2,11 +2,18 @@
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <sstream>
|
||||
|
||||
#include "api.h"
|
||||
|
||||
namespace mscclpp {
|
||||
|
||||
MSCCLPP_API_CPP std::string version() {
|
||||
std::stringstream ss;
|
||||
ss << MSCCLPP_MAJOR << "." << MSCCLPP_MINOR << "." << MSCCLPP_PATCH;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
MSCCLPP_API_CPP TransportFlags::TransportFlags(Transport transport)
|
||||
: detail::TransportFlagsBase(1 << static_cast<size_t>(transport)) {}
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#ifndef MSCCL_COMMUNICATOR_HPP_
|
||||
#define MSCCL_COMMUNICATOR_HPP_
|
||||
#ifndef MSCCLPP_COMMUNICATOR_HPP_
|
||||
#define MSCCLPP_COMMUNICATOR_HPP_
|
||||
|
||||
#include <memory>
|
||||
#include <mscclpp/core.hpp>
|
||||
@@ -31,4 +31,4 @@ struct Communicator::Impl {
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCL_COMMUNICATOR_HPP_
|
||||
#endif // MSCCLPP_COMMUNICATOR_HPP_
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#ifndef MSCCL_CONTEXT_HPP_
|
||||
#define MSCCL_CONTEXT_HPP_
|
||||
#ifndef MSCCLPP_CONTEXT_HPP_
|
||||
#define MSCCLPP_CONTEXT_HPP_
|
||||
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <mscclpp/cuda_utils.hpp>
|
||||
@@ -25,4 +25,4 @@ struct Context::Impl {
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCL_CONTEXT_HPP_
|
||||
#endif // MSCCLPP_CONTEXT_HPP_
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT license.
|
||||
|
||||
#ifndef MSCCL_ENDPOINT_HPP_
|
||||
#define MSCCL_ENDPOINT_HPP_
|
||||
#ifndef MSCCLPP_ENDPOINT_HPP_
|
||||
#define MSCCLPP_ENDPOINT_HPP_
|
||||
|
||||
#include <mscclpp/core.hpp>
|
||||
#include <vector>
|
||||
@@ -26,4 +26,4 @@ struct Endpoint::Impl {
|
||||
|
||||
} // namespace mscclpp
|
||||
|
||||
#endif // MSCCL_ENDPOINT_HPP_
|
||||
#endif // MSCCLPP_ENDPOINT_HPP_
|
||||
|
||||
@@ -16,17 +16,9 @@ def load_perf_file(perf_fine: str) -> dict:
|
||||
"time": data["time"],
|
||||
}
|
||||
if "target" in data:
|
||||
res[
|
||||
(
|
||||
data["name"],
|
||||
data["kernel"],
|
||||
data["ranks"],
|
||||
data["ranksPerNode"],
|
||||
data["size"],
|
||||
)
|
||||
][
|
||||
res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
|
||||
"target"
|
||||
] = data["target"]
|
||||
]
|
||||
return res
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user