commit 8c0f9e84d0 (parent 11ac824cc7)
Author: Changho Hwang
Date: 2023-10-11 22:35:54 +08:00
Committed via: GitHub
21 changed files with 133 additions and 162 deletions

View File

@@ -12,7 +12,7 @@ jobs:
name: Analyze
runs-on: 'ubuntu-latest'
container:
- image: ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda-version }}
+ image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
permissions:
actions: read
@@ -27,7 +27,7 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Check disk space
run: |
@@ -38,12 +38,6 @@ jobs:
with:
languages: ${{ matrix.language }}
- - name: Install cmake
-   run: |
-     curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-     tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
-     sudo ln -s /tmp/cmake-3.26.4-linux-x86_64/bin/cmake /usr/bin/cmake
- name: Dubious ownership exception
run: |
git config --global --add safe.directory /__w/mscclpp/mscclpp

View File

@@ -4,7 +4,7 @@ on: workflow_dispatch
jobs:
IntegrationTest:
- runs-on: self-hosted
+ runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
@@ -13,22 +13,17 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
- name: Checkout
uses: actions/checkout@v4
- - name: Install CMake
-   run: |
-     curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-     tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
- name: Build
run: |
mkdir build && cd build
- MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+ MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
@@ -41,7 +36,6 @@ jobs:
- name: Run mscclpp AllGather test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -50,13 +44,11 @@ jobs:
- name: Run mscclpp SendRecv test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
- name: Run mscclpp AllReduce test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
@@ -68,7 +60,6 @@ jobs:
- name: Run mscclpp AllToAll test
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
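For context on the flags used in these perf-test invocations, here is a minimal Python sketch of the message-size sweep implied by "-b 1K -e 1G -f 2", assuming the usual begin-size / end-size / multiplication-factor convention (the flag semantics are an assumption, not stated in this commit):

# Sketch only: assumes -b/-e/-f mean begin size, end size, and multiplication factor.
def sweep_sizes(begin=1 << 10, end=1 << 30, factor=2):
    size = begin
    while size <= end:
        yield size
        size *= factor

print(list(sweep_sizes())[:3], "...", len(list(sweep_sizes())))  # [1024, 2048, 4096] ... 21 sizes total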

View File

@@ -11,7 +11,7 @@ jobs:
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Install ClangFormat
run: |
@@ -28,25 +28,25 @@ jobs:
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
- python-version: 3.8
+ python-version: 3
- name: Install Python dependencies
- run: python3.8 -m pip install black
+ run: python3 -m pip install black
- name: Run black
- run: python3.8 -m black --check --config pyproject.toml .
+ run: python3 -m black --check --config pyproject.toml .
spelling:
runs-on: ubuntu-20.04
steps:
- name: Check out Git repository
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Download misspell
run: |

View File

@@ -4,7 +4,7 @@ on: workflow_dispatch
jobs:
UnitTest:
- runs-on: self-hosted
+ runs-on: [ self-hosted, A100 ]
defaults:
run:
shell: bash
@@ -14,7 +14,7 @@ jobs:
cuda: [ cuda11.8, cuda12.1 ]
container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
steps:
@@ -23,10 +23,8 @@ jobs:
- name: Build
run: |
- curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
- tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
- MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+ MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}
@@ -36,31 +34,20 @@ jobs:
for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
done
working-directory: ${{ github.workspace }}
- name: UnitTests
run: |
./build/test/unit_tests
working-directory: ${{ github.workspace }}
- name: MpUnitTests
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests
working-directory: ${{ github.workspace }}
- name: PyTests
run: |
set -e
- export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '${{ matrix.cuda }}' == 'cuda11'* ]]; then
python3 -m pip install -r ../python/test/requirements_cu11.txt
else
python3 -m pip install -r ../python/test/requirements_cu12.txt
fi
mpirun --allow-run-as-root -tag-output -np 8 $(which pytest) ../python/test/test_mscclpp.py -x
working-directory: ${{ github.workspace }}
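The "Lock GPU clock frequency" step above drives nvidia-smi once per GPU from a shell loop; a rough Python equivalent is sketched below (an illustration only, assuming nvidia-smi is on PATH and the runner allows sudo):

import subprocess

# Mirrors the shell loop: query max memory/SM clocks per GPU, then pin them with -ac.
num_gpus = len(subprocess.run(["nvidia-smi", "-L"], capture_output=True, text=True, check=True).stdout.strip().splitlines())
for i in range(num_gpus):
    clocks = subprocess.run(
        ["nvidia-smi", "--query-gpu=clocks.max.memory,clocks.max.sm",
         "--format=csv,noheader,nounits", "-i", str(i)],
        capture_output=True, text=True, check=True,
    ).stdout.strip().replace(" ", "")  # e.g. "1593,1410"
    subprocess.run(["sudo", "nvidia-smi", "-ac", clocks, "-i", str(i)], check=True)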

View File

@@ -2,7 +2,7 @@
# Licensed under the MIT license.
set(MSCCLPP_MAJOR "0")
set(MSCCLPP_MINOR "2")
set(MSCCLPP_MINOR "3")
set(MSCCLPP_PATCH "0")
set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})

View File

@@ -18,9 +18,9 @@ MSCCL++ is a development kit for implementing highly optimized distributed GPU a
* **Runtime Performance Optimization for Dynamic Workload.** As we can easily implement flexible communication logics, we can optimize communication performance even during runtime. For example, we can implement the system to automatically choose different communication paths or different collective communication algorithms depending on the dynamic workload at runtime.
- ## Key Features (v0.2)
+ ## Key Features (v0.3)
- MSCCL++ v0.2 supports the following features.
+ MSCCL++ v0.3 supports the following features.
### In-Kernel Communication Interfaces
@@ -124,31 +124,15 @@ Customized proxies can be used for conducting a series of pre-defined data trans
Most of key components of MSCCL++ are designed to be easily customized. This enables MSCCL++ to easily adopt a new software / hardware technology and lets users implement algorithms optimized for their own use cases.
## Status & Roadmap
+ ### New in MSCCL++ v0.3 (Latest Release)
+ * Updated interfaces
+ * Add Python bindings and interfaces
+ * Add Python unit tests
+ * Add more configurable parameters
+ * Add a new single-node AllReduce kernel
+ * Fix bugs
MSCCL++ is under active development and a part of its features will be added in a future release. The following describes key features of each version.
### MSCCL++ v0.4 (TBU)
* Automatic task scheduler
* Dynamic performance tuning
- ### MSCCL++ v0.3 (TBU)
- * Tile-based communication: efficient transport of 2D data patches (tiles)
- * GPU computation interfaces
- ### MSCCL++ v0.2 (Latest Release)
- * Basic communication functionalities and new interfaces
-   - GPU-side communication interfaces
-   - Host-side helpers: bootstrap, communicator, and proxy
-   - Supports both NVLink and InfiniBand
-   - Supports both in-SM copy and DMA/RDMA
- * Communication performance optimization
-   - Example code outperforms NCCL/MSCCL AllGather/AllReduce/AllToAll
- * Development pipeline
- * Documentation
### MSCCL++ v0.1
* Proof-of-concept, preliminary interfaces
See details from https://github.com/microsoft/mscclpp/issues/89.
## Contributing

View File

@@ -5,7 +5,10 @@ LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV DEBIAN_FRONTEND=noninteractive
- RUN apt-get update && \
+ RUN rm -rf /opt/nvidia
+ RUN apt-get clean && \
+     apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
@@ -47,8 +50,10 @@ RUN cd /tmp && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
ENV PATH="${PATH}:/usr/local/mpi/bin" \
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
+ RUN echo PATH="${PATH}" > /etc/environment && \
+     echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
ENTRYPOINT []

View File

@@ -0,0 +1,28 @@
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
CMAKE_VERSION="3.26.4"
ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}
# Install cmake 3.26.4
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
rm -rf ${CMAKE_HOME}.tar.gz
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
# Set PATH
RUN echo PATH="${PATH}" > /etc/environment
# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}

View File

@@ -0,0 +1,27 @@
FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
CMAKE_VERSION="3.26.4"
ADD . ${MSCCLPP_SRC_DIR}
WORKDIR ${MSCCLPP_SRC_DIR}
# Install cmake 3.26.4
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
# Install pytest & dependencies
RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
# Set PATH
RUN echo PATH="${PATH}" > /etc/environment
# Cleanup
WORKDIR /
RUN rm -rf ${MSCCLPP_SRC_DIR}

View File

@@ -5,7 +5,7 @@
#define MSCCLPP_CORE_HPP_
#define MSCCLPP_MAJOR 0
- #define MSCCLPP_MINOR 2
+ #define MSCCLPP_MINOR 3
#define MSCCLPP_PATCH 0
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)
@@ -24,6 +24,9 @@ namespace mscclpp {
/// Unique ID for a process. This is a MSCCLPP_UNIQUE_ID_BYTES byte array that uniquely identifies a process.
using UniqueId = std::array<uint8_t, MSCCLPP_UNIQUE_ID_BYTES>;
+ /// Return a version string.
+ std::string version();
/// Base class for bootstraps.
class Bootstrap {
public:
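As a quick sanity check of the MSCCLPP_VERSION macro above, the packed value for this release (0.3.0) works out as follows; this is just the macro's arithmetic restated as a sketch:

# MSCCLPP_VERSION = MAJOR * 10000 + MINOR * 100 + PATCH
major, minor, patch = 0, 3, 0
assert major * 10000 + minor * 100 + patch == 300  # v0.3.0 packs to 300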

View File

@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"
[project]
name = "mscclpp"
version = "0.2.0"
version = "0.3.0"
[tool.scikit-build]
cmake.minimum-version = "3.25.0"

View File

@@ -7,11 +7,10 @@ add_subdirectory(test)
add_custom_target(pylib-copy)
add_custom_command(TARGET pylib-copy POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.cpython-38-x86_64-linux-gnu.so
+ ${CMAKE_CURRENT_BINARY_DIR}/mscclpp/_mscclpp.*.so
${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
COMMAND ${CMAKE_COMMAND} -E copy_if_different
- ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.cpython-38-x86_64-linux-gnu.so
+ ${CMAKE_CURRENT_BINARY_DIR}/test/_ext.*.so
${CMAKE_CURRENT_SOURCE_DIR}/test/_cpp
COMMAND ${CMAKE_COMMAND} -E echo "Copy python libraries"
)
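The wildcard above replaces a suffix that was hard-coded to the CPython 3.8 ABI; the suffix actually produced by whichever interpreter builds the extension can be inspected as in this sketch (the printed value depends on the local Python):

import sysconfig

# The old rule assumed ".cpython-38-x86_64-linux-gnu.so"; this prints the
# ABI-specific extension suffix for the current interpreter instead.
print(sysconfig.get_config_var("EXT_SUFFIX"))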

View File

@@ -18,8 +18,11 @@ from ._mscclpp import (
TcpBootstrap,
Transport,
TransportFlags,
+ version,
)
+ __version__ = version()
def get_include():
"""Return the directory that contains the MSCCL++ headers."""

View File

@@ -29,6 +29,8 @@ void def_nonblocking_future(nb::handle& m, const std::string& typestr) {
}
void register_core(nb::module_& m) {
m.def("version", &version);
nb::class_<Bootstrap>(m, "Bootstrap")
.def("get_rank", &Bootstrap::getRank)
.def("get_n_ranks", &Bootstrap::getNranks)

View File

@@ -127,10 +127,7 @@ class MscclppGroup:
channels = {}
for rank in connections:
channels[rank] = SmChannel(
- semaphores[rank],
- registered_memories[rank],
- tensor.data.ptr,
- packetTensor.data.ptr,
+ semaphores[rank], registered_memories[rank], tensor.data.ptr, packetTensor.data.ptr
)
return channels
@@ -148,8 +145,6 @@ class MscclppGroup:
channels = {}
for rank in semaphores:
channels[rank] = SimpleProxyChannel(
- proxy_service.proxy_channel(semaphore_ids[rank]),
- memory_ids[rank],
- memory_ids[self.my_rank],
+ proxy_service.proxy_channel(semaphore_ids[rank]), memory_ids[rank], memory_ids[self.my_rank]
)
return channels

View File

@@ -9,14 +9,7 @@ import numpy as np
import netifaces as ni
import pytest
- from mscclpp import (
- Fifo,
- Host2DeviceSemaphore,
- Host2HostSemaphore,
- ProxyService,
- SmDevice2DeviceSemaphore,
- Transport,
- )
+ from mscclpp import Fifo, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, SmDevice2DeviceSemaphore, Transport
from ._cpp import _ext
from .mscclpp_group import MscclppGroup
from .mscclpp_mpi import MpiGroup, parametrize_mpi_groups, mpi_group
@@ -61,11 +54,7 @@ def test_group_with_ip(mpi_group: MpiGroup, ifIpPortTrio: str):
for rank in range(group.nranks):
if rank == group.my_rank:
continue
- group.send(
- memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))],
- rank,
- 0,
- )
+ group.send(memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))], rank, 0)
for rank in range(group.nranks):
if rank == group.my_rank:
continue
@@ -207,43 +196,31 @@ class MscclppKernel:
):
if test_name == "h2d_semaphore":
self._kernel = KernelBuilder(
file="h2d_semaphore_test.cu",
kernel_name="h2d_semaphore",
file="h2d_semaphore_test.cu", kernel_name="h2d_semaphore"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "d2d_semaphore":
self._kernel = KernelBuilder(
file="d2d_semaphore_test.cu",
kernel_name="d2d_semaphore",
file="d2d_semaphore_test.cu", kernel_name="d2d_semaphore"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "sm_channel":
- self._kernel = KernelBuilder(
- file="sm_channel_test.cu",
- kernel_name="sm_channel",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="sm_channel_test.cu", kernel_name="sm_channel").get_compiled_kernel()
self.nblocks = nranks
self.nthreads = 1024
elif test_name == "fifo":
- self._kernel = KernelBuilder(
- file="fifo_test.cu",
- kernel_name="fifo",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="fifo_test.cu", kernel_name="fifo").get_compiled_kernel()
self.nblocks = 1
self.nthreads = 1
elif test_name == "proxy":
- self._kernel = KernelBuilder(
- file="proxy_test.cu",
- kernel_name="proxy",
- ).get_compiled_kernel()
+ self._kernel = KernelBuilder(file="proxy_test.cu", kernel_name="proxy").get_compiled_kernel()
self.nblocks = 1
self.nthreads = nranks
elif test_name == "simple_proxy_channel":
self._kernel = KernelBuilder(
file="simple_proxy_channel_test.cu",
kernel_name="simple_proxy_channel",
file="simple_proxy_channel_test.cu", kernel_name="simple_proxy_channel"
).get_compiled_kernel()
self.nblocks = 1
self.nthreads = 1024
@@ -364,17 +341,10 @@ def test_fifo(
@parametrize_mpi_groups(2, 4, 8, 16)
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
@pytest.mark.parametrize("transport", ["IB", "NVLink"])
- def test_proxy(
- mpi_group: MpiGroup,
- nelem: int,
- transport: str,
- ):
+ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str):
group, connections = create_and_connect(mpi_group, transport)
- memory = cp.zeros(
- nelem,
- dtype=cp.int32,
- )
+ memory = cp.zeros(nelem, dtype=cp.int32)
nelemPerRank = nelem // group.nranks
nelemPerRank * memory.itemsize
memory[(nelemPerRank * group.my_rank) : (nelemPerRank * (group.my_rank + 1))] = group.my_rank + 1
@@ -401,23 +371,12 @@ def test_proxy(
list_reg_mem.append(all_reg_memories[rank])
- proxy = _ext.MyProxyService(
- group.my_rank,
- group.nranks,
- nelem * memory.itemsize,
- list_conn,
- list_reg_mem,
- list_sem,
- )
+ proxy = _ext.MyProxyService(group.my_rank, group.nranks, nelem * memory.itemsize, list_conn, list_reg_mem, list_sem)
fifo_device_handle = proxy.fifo_device_handle()
kernel = MscclppKernel(
"proxy",
my_rank=group.my_rank,
nranks=group.nranks,
semaphore_or_channels=list_sem,
fifo=fifo_device_handle,
"proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle
)
proxy.start()
group.barrier()
@@ -432,12 +391,7 @@ def test_proxy(
@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]])
@pytest.mark.parametrize("transport", ["NVLink", "IB"])
@pytest.mark.parametrize("use_packet", [False, True])
- def test_simple_proxy_channel(
- mpi_group: MpiGroup,
- nelem: int,
- transport: str,
- use_packet: bool,
- ):
+ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool):
group, connections = create_and_connect(mpi_group, transport)
memory = cp.zeros(nelem, dtype=cp.int32)

View File

@@ -2,11 +2,18 @@
// Licensed under the MIT license.
#include <mscclpp/core.hpp>
+ #include <sstream>
#include "api.h"
namespace mscclpp {
+ MSCCLPP_API_CPP std::string version() {
+   std::stringstream ss;
+   ss << MSCCLPP_MAJOR << "." << MSCCLPP_MINOR << "." << MSCCLPP_PATCH;
+   return ss.str();
+ }
MSCCLPP_API_CPP TransportFlags::TransportFlags(Transport transport)
: detail::TransportFlagsBase(1 << static_cast<size_t>(transport)) {}

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_COMMUNICATOR_HPP_
- #define MSCCL_COMMUNICATOR_HPP_
+ #ifndef MSCCLPP_COMMUNICATOR_HPP_
+ #define MSCCLPP_COMMUNICATOR_HPP_
#include <memory>
#include <mscclpp/core.hpp>
@@ -31,4 +31,4 @@ struct Communicator::Impl {
} // namespace mscclpp
- #endif // MSCCL_COMMUNICATOR_HPP_
+ #endif // MSCCLPP_COMMUNICATOR_HPP_

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_CONTEXT_HPP_
- #define MSCCL_CONTEXT_HPP_
+ #ifndef MSCCLPP_CONTEXT_HPP_
+ #define MSCCLPP_CONTEXT_HPP_
#include <mscclpp/core.hpp>
#include <mscclpp/cuda_utils.hpp>
@@ -25,4 +25,4 @@ struct Context::Impl {
} // namespace mscclpp
- #endif // MSCCL_CONTEXT_HPP_
+ #endif // MSCCLPP_CONTEXT_HPP_

View File

@@ -1,8 +1,8 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.
- #ifndef MSCCL_ENDPOINT_HPP_
- #define MSCCL_ENDPOINT_HPP_
+ #ifndef MSCCLPP_ENDPOINT_HPP_
+ #define MSCCLPP_ENDPOINT_HPP_
#include <mscclpp/core.hpp>
#include <vector>
@@ -26,4 +26,4 @@ struct Endpoint::Impl {
} // namespace mscclpp
- #endif // MSCCL_ENDPOINT_HPP_
+ #endif // MSCCLPP_ENDPOINT_HPP_

View File

@@ -16,17 +16,9 @@ def load_perf_file(perf_fine: str) -> dict:
"time": data["time"],
}
if "target" in data:
- res[
-     (
-         data["name"],
-         data["kernel"],
-         data["ranks"],
-         data["ranksPerNode"],
-         data["size"],
-     )
- ][
-     "target"
- ] = data["target"]
+ res[(data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])]["target"] = data[
+     "target"
+ ]
return res
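To make the reshaped expression above concrete, here is a sketch of how one perf-result record lands in res; the field values below are made up, only the field names come from the code above:

import json

line = '{"name": "allgather", "kernel": 1, "ranks": 8, "ranksPerNode": 8, "size": 1048576, "time": 123.4, "target": 130.0}'
data = json.loads(line)
key = (data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])
res = {key: {"time": data["time"]}}
if "target" in data:
    res[key]["target"] = data["target"]
# res == {("allgather", 1, 8, 8, 1048576): {"time": 123.4, "target": 130.0}}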