commit 8c0f9e84d0 (parent 11ac824cc7)
Author: Changho Hwang
Date: 2023-10-11 22:35:54 +08:00
Committed via: GitHub
21 changed files with 133 additions and 162 deletions

View File

@@ -12,7 +12,7 @@ jobs:
name: Analyze
runs-on: 'ubuntu-latest'
container:
- image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
+ image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
permissions:
actions: read
@@ -27,7 +27,7 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Check disk space
run: |
@@ -38,12 +38,6 @@ jobs:
with:
languages: ${{ matrix.language }}
- - name: Install cmake
-   run: |
-     curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-     tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-     sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
- name: Dubious ownership exception
run: |
git config --global --add safe.directory /__w/mscclpp/mscclpp

View File

@@ -4,7 +4,7 @@ on: workflow_dispatch
jobs:
IntegrationTest:
- runs-on: self-hosted
+ runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
@@ -13,22 +13,17 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- - name: Install CMake
-   run: |
-     curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-     tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
- name: Build
run: |
mkdir build && cd build
- MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+ MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
@@ -41,7 +36,6 @@ jobs:
- name: Run mscclpp AllGather test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -50,13 +44,11 @@ jobs:
- name: Run mscclpp SendRecv test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -68,7 +60,6 @@ jobs:
- name: Run mscclpp AllToAll test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
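For context on the flags used in these perf-test invocations, here is a minimal Python sketch of the message-size sweep implied by "-b 1K -e 1G -f 2", assuming the usual begin-size / end-size / multiplication-factor convention (the flag semantics are an assumption, not stated in this commit):

# Sketch only: assumes -b/-e/-f mean begin size, end size, and multiplication factor.
def sweep_sizes(begin=1 << 10, end=1 << 30, factor=2):
    size = begin
    while size <= end:
        yield size
        size *= factor

print(list(sweep_sizes())[:3], "...", len(list(sweep_sizes())))  # [1024, 2048, 4096] ... 21 sizes total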

View File

@@ -11,7 +11,7 @@ jobs:
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Install ClangFormat
run: |
@@ -28,25 +28,25 @@ jobs:
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
- python-version: 3.8
+ python-version: 3
- name: Install Python dependencies
- run: python3.8 -m pip install black
+ run: python3 -m pip install black
- name: Run black
- run: python3.8 -m black --check --config pyproject.toml .
+ run: python3 -m black --check --config pyproject.toml .
spelling:
runs-on: ubuntu-20.04
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Download misspell
run: |

View File

@@ -4,7 +4,7 @@ on: workflow_dispatch
jobs:
UnitTest:
- runs-on: self-hosted
+ runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
@@ -14,7 +14,7 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
@@ -23,10 +23,8 @@ jobs:
- name: Build
run: |
- curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
- tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
- MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+ MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
@@ -36,31 +34,20 @@ jobs:
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
working-directory: ${{ github.workspace }}
- name: UnitTests
run: |
./build/test/unit_tests
working-directory: ${{ github.workspace }}
- name: MpUnitTests
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
working-directory: ${{ github.workspace }}
- name: PyTests
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
python3 -m pip install -r ../python/test/requirements_cu11.txt
else
python3 -m pip install -r ../python/test/requirements_cu12.txt
fi
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
working-directory: ${{ github.workspace }}
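The "Lock GPU clock frequency" step above drives nvidia-smi once per GPU from a shell loop; a rough Python equivalent is sketched below (an illustration only, assuming nvidia-smi is on PATH and the runner allows sudo):

import subprocess

# Mirrors the shell loop: query max memory/SM clocks per GPU, then pin them with -ac.
num_gpus = len(subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True, check=True).stdout.strip().splitlines())
for i in range(num_gpus):
    clocks = subprocess.run(
        ["nvidia-smi", "--query-gpu=clocks.max.memory,clocks.max.sm",
         "--format=csv,noheader,nounits", "-i", str(i)],
        capture_output=True, text=True, check=True,
    ).stdout.strip().replace(" ", "")  # e.g. "1593,1410"
    subprocess.run(["sudo", "nvidia-smi", "-ac", clocks, "-i", str(i)], check=True)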

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT license.
set(MSCCLPP_MAJOR "0")
set(MSCCLPP_MINOR "2")
set(MSCCLPP_MINOR "3")
set(MSCCLPP_PATCH "0")
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})

View File

@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
- ## Key Features (v0.2)
+ ## Key Features (v0.3)
- MSCCL++ v0.2 supports the following features.
+ MSCCL++ v0.3 supports the following features.
### In-Kernel Communication Interfaces
@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
## Status & Roadmap
+ ### New in MSCCL++ v0.3 (Latest Release)
+ * Updated interfaces
+ * Add Python bindings and interfaces
+ * Add Python unit tests
+ * Add more configurable parameters
+ * Add a new single-node AllReduce kernel
+ * Fix bugs
MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
### MSCCL++ v0.4 (TBU)
* Automatic task scheduler
* Dynamic performance tuning
- ### MSCCL++ v0.3 (TBU)
- * Tile-based communication: efficient transport of 2D data patches (tiles)
- * GPU computation interfaces
- ### MSCCL++ v0.2 (Latest Release)
- * Basic communication functionalities and new interfaces
-   - GPU-side communication interfaces
-   - Host-side helpers: bootstrap, communicator, and proxy
-   - Supports both NVLink and InfiniBand
-   - Supports both in-SM copy and DMA/RDMA
- * Communication performance optimization
-   - Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
- * Development pipeline
- * Documentation
### MSCCL++ v0.1
* Proof-of-concept, preliminary interfaces
See details from https://github.com/microsoft/mscclpp/issues/89.
## Contributing

View File

@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
- RUN apt-get update && \
+ RUN rm -rf /opt/nvidia
+ RUN apt-get clean && \
+     apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
@@ -47,8 +50,10 @@ RUN cd /tmp && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
ENV PATH="${PATH}:/usr/local/mpi/bin" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
+ RUN echo PATH="${PATH}" > /etc/environment && \
+     echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
ENTRYPOINT []

View File

@@ -0,0 +1,28 @@
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
CMAKE_VERSION="3.26.4"
ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}
# Install cmake 3.26.4
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
rm -rf ${CMAKE_HOME}.tar.gz
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
# Set PATH
RUN echo PATH="${PATH}" > /etc/environment
# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}

View File

@@ -0,0 +1,27 @@
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
CMAKE_VERSION="3.26.4"
ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}
# Install cmake 3.26.4
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
# Set PATH
RUN echo PATH="${PATH}" > /etc/environment
# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}

View File

@@ -5,7 +5,7 @@
#define MSCCLPP_CORE_HPP_
#define MSCCLPP_MAJOR 0
- #define MSCCLPP_MINOR 2
+ #define MSCCLPP_MINOR 3
#define MSCCLPP_PATCH 0
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
@@ -24,6 +24,9 @@ namespace mscclpp {
/// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
+ /// Return a version string.
+ std::string version();
/// Base class for bootstraps.
class Bootstrap {
public:
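As a quick sanity check of the MSCCLPP_VERSION macro above, the packed value for this release (0.3.0) works out as follows; this is just the macro's arithmetic restated as a sketch:

# MSCCLPP_VERSION = MAJOR * 10000 + MINOR * 100 + PATCH
major, minor, patch = 0, 3, 0
assert major * 10000 + minor * 100 + patch == 300  # v0.3.0 packs to 300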

View File

@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
[project]
name = "mscclpp"
version = "0.2.0"
version = "0.3.0"
[tool.scikit-build]
cmake.minimum-version = "3.25.0"

View File

@@ -7,11 +7,10 @@ add_subdirectory(test)
add_custom_target(pylib-copy)
add_custom_command(TARGET pylib-copy POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
+ ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
+ ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
)
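The wildcard above replaces a suffix that was hard-coded to the CPython 3.8 ABI; the suffix actually produced by whichever interpreter builds the extension can be inspected as in this sketch (the printed value depends on the local Python):

import sysconfig

# The old rule assumed ".cpython-38-x86_64-linux-gnu.so"; this prints the
# ABI-specific extension suffix for the current interpreter instead.
print(sysconfig.get_config_var("EXT_SUFFIX"))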

View File

@@ -18,8 +18,11 @@ from ._mscclpp import (
TcpBootstrap,
Transport,
TransportFlags,
+ version,
)
+ __version__ = version()
def get_include():
"""Return the directory that contains the MSCCL++ headers."""

View File

@@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
}
void register_core(nb::module_& m) {
m.def("version", &version);
nb::class_<Bootstrap>(m, "Bootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)

View File

@@ -127,10 +127,7 @@ class MscclppGroup:
channels = {}
for rank in connections:
channels[rank] = SmChannel(
- semaphores[rank],
- registered_memories[rank],
- tensor.data.ptr,
- packetTensor.data.ptr,
+ semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr
)
return channels
@@ -148,8 +145,6 @@ class MscclppGroup:
channels = {}
for rank in semaphores:
channels[rank] = SimpleProxyChannel(
- proxy_service.proxy_channel(semaphore_ids[rank]),
- memory_ids[rank],
- memory_ids[self.my_rank],
+ proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank]
)
return channels

View File

@@ -9,14 +9,7 @@ import numpy as np
import netifaces as ni
import pytest
- from mscclpp import (
- Fifo,
- Host2DeviceSemaphore,
- Host2HostSemaphore,
- ProxyService,
- SmDevice2DeviceSemaphore,
- Transport,
- )
+ from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport
from ._cpp import _ext
from .mscclpp_group import MscclppGroup
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -61,11 +54,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
for rank in range(group.nranks):
if rank == group.my_rank:
continue
- group.send(
- memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))],
- rank,
- 0,
- )
+ group.send(memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], rank, 0)
for rank in range(group.nranks):
if rank == group.my_rank:
continue
@@ -207,43 +196,31 @@ class MscclppKernel:
):
if test_name == "h2d_semaphore":
self._kernel = KernelBuilder(
file="h2d_semaphore_test.cu",
kernel_name="h2d_semaphore",
file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "d2d_semaphore":
self._kernel = KernelBuilder(
file="d2d_semaphore_test.cu",
kernel_name="d2d_semaphore",
file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "sm_channel":
- self._kernel = KernelBuilder(
- file="sm_channel_test.cu",
- kernel_name="sm_channel",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel()
self.nblocks = nranks
self.nthreads = 1024
elif test_name == "fifo":
- self._kernel = KernelBuilder(
- file="fifo_test.cu",
- kernel_name="fifo",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel()
self.nblocks = 1
self.nthreads = 1
elif test_name == "proxy":
- self._kernel = KernelBuilder(
- file="proxy_test.cu",
- kernel_name="proxy",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "simple_proxy_channel":
self._kernel = KernelBuilder(
file="simple_proxy_channel_test.cu",
kernel_name="simple_proxy_channel",
file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = 1024
@@ -364,17 +341,10 @@ def test_fifo(
@parametrize_mpi_groups(2, 4, 8, 16)
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
@pytest.mark.parametrize("transport", ["IB", "NVLink"])
- def test_proxy(
- mpi_group: MpiGroup,
- nelem: int,
- transport: str,
- ):
+ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
group, connections = create_and_connect(mpi_group, transport)
- memory = cp.zeros(
- nelem,
- dtype=cp.int32,
- )
+ memory = cp.zeros(nelem, dtype=cp.int32)
nelemPerRank = nelem // group.nranks
nelemPerRank * memory.itemsize
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -401,23 +371,12 @@ def test_proxy(
list_reg_mem.append(all_reg_memories[rank])
- proxy = _ext.MyProxyService(
- group.my_rank,
- group.nranks,
- nelem * memory.itemsize,
- list_conn,
- list_reg_mem,
- list_sem,
- )
+ proxy = _ext.MyProxyService(group.my_rank, group.nranks, nelem * memory.itemsize, list_conn, list_reg_mem, list_sem)
fifo_device_handle = proxy.fifo_device_handle()
kernel = MscclppKernel(
"proxy",
my_rank=group.my_rank,
nranks=group.nranks,
semaphore_or_channels=list_sem,
fifo=fifo_device_handle,
"proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle
)
proxy.start()
group.barrier()
@@ -432,12 +391,7 @@ def test_proxy(
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
@pytest.mark.parametrize("transport", ["NVLink", "IB"])
@pytest.mark.parametrize("use_packet", [False, True])
- def test_simple_proxy_channel(
- mpi_group: MpiGroup,
- nelem: int,
- transport: str,
- use_packet: bool,
- ):
+ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
group, connections = create_and_connect(mpi_group, transport)
memory = cp.zeros(nelem, dtype=cp.int32)

View File

@@ -2,11 +2,18 @@
// Licensed under the MIT license.
#include <mscclpp/core.hpp>
+ #include <sstream>
#include "api.h"
namespace mscclpp {
+ MSCCLPP_API_CPP std::string version() {
+   std::stringstream ss;
+   ss << MSCCLPP_MAJOR << "." << MSCCLPP_MINOR << "." << MSCCLPP_PATCH;
+   return ss.str();
+ }
MSCCLPP_API_CPP TransportFlags::TransportFlags(Transport transport)
: detail::TransportFlagsBase(1 << static_cast<size_t>(transport)) {}

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_COMMUNICATOR_HPP_
- #define MSCCL_COMMUNICATOR_HPP_
+ #ifndef MSCCLPP_COMMUNICATOR_HPP_
+ #define MSCCLPP_COMMUNICATOR_HPP_
#include <memory>
#include <mscclpp/core.hpp>
@@ -31,4 +31,4 @@ struct Communicator::Impl {
} // namespace mscclpp
- #endif // MSCCL_COMMUNICATOR_HPP_
+ #endif // MSCCLPP_COMMUNICATOR_HPP_

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_CONTEXT_HPP_
- #define MSCCL_CONTEXT_HPP_
+ #ifndef MSCCLPP_CONTEXT_HPP_
+ #define MSCCLPP_CONTEXT_HPP_
#include <mscclpp/core.hpp>
#include <mscclpp/cuda_utils.hpp>
@@ -25,4 +25,4 @@ struct Context::Impl {
} // namespace mscclpp
- #endif // MSCCL_CONTEXT_HPP_
+ #endif // MSCCLPP_CONTEXT_HPP_

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_ENDPOINT_HPP_
- #define MSCCL_ENDPOINT_HPP_
+ #ifndef MSCCLPP_ENDPOINT_HPP_
+ #define MSCCLPP_ENDPOINT_HPP_
#include <mscclpp/core.hpp>
#include <vector>
@@ -26,4 +26,4 @@ struct Endpoint::Impl {
} // namespace mscclpp
- #endif // MSCCL_ENDPOINT_HPP_
+ #endif // MSCCLPP_ENDPOINT_HPP_

View File

@@ -16,17 +16,9 @@ def load_perf_file(perf_fine: str) -> dict:
"time": data["time"],
}
if "target" in data:
- res[
-     (
-         data["name"],
-         data["kernel"],
-         data["ranks"],
-         data["ranksPerNode"],
-         data["size"],
-     )
- ][
-     "target"
- ] = data["target"]
+ res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
+     "target"
+ ]
return res
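To make the reshaped expression above concrete, here is a sketch of how one perf-result record lands in res; the field values below are made up, only the field names come from the code above:

import json

line = '{"name": "allgather", "kernel": 1, "ranks": 8, "ranksPerNode": 8, "size": 1048576, "time": 123.4, "target": 130.0}'
data = json.loads(line)
key = (data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])
res = {key: {"time": data["time"]}}
if "target" in data:
    res[key]["target"] = data["target"]
# res == {("allgather", 1, 8, 8, 1048576): {"time": 123.4, "target": 130.0}}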