mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 17:00:22 +00:00
## Summary

- Replace the two-step `signal()` implementation (`incOutbound()` + `atomicStore()`) with a single fire-and-forget PTX `red.release.sys.global.add.u64` instruction
- This eliminates one local atomic fetch-add and replaces a remote store with a remote atomic add that has no return value — more efficient on both NVIDIA (PTX `red`) and AMD (compiler optimizes `(void)fetch_add` to fire-and-forget `flat_atomic_add_x2`)
- Add a C++ perf test (`PERF_TEST`) in `mp_unit` for signal+wait ping-pong latency

### Performance (H100, 2 ranks, signal+wait round-trip)

```
SemaphorePerfTest.SignalPingPong:
  Store-based (old): 2.595 us/iter
  Red-based (new): 2.345 us/iter
  Speedup: 1.11x
```

## Test plan

- [x] Builds successfully (`make mp_unit_tests`)
- [x] `mpirun -np 2 ./build/bin/mp_unit_tests --filter "SemaphorePerfTest"` — 1.11x speedup

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
52 lines
2.7 KiB
C++
52 lines
2.7 KiB
C++
// Copyright (c) Microsoft Corporation.
|
|
// Licensed under the MIT license.
|
|
|
|
#include <nanobind/nanobind.h>
|
|
#include <nanobind/stl/shared_ptr.h>
|
|
|
|
#include <mscclpp/semaphore.hpp>
|
|
|
|
namespace nb = nanobind;
|
|
using namespace mscclpp;
|
|
|
|
/// Registers the Python bindings for the semaphore classes on module `m`.
///
/// Three classes are exposed at module level: `CppHost2DeviceSemaphore`,
/// `CppHost2HostSemaphore` and `CppMemoryDevice2DeviceSemaphore`. The two
/// semaphores that provide a device handle additionally get a nested
/// `DeviceHandle` class registered inside their own Python class scope.
///
/// @param m nanobind module that receives the class registrations.
void register_semaphore(nb::module_& m) {
  using H2DSemaphore = Host2DeviceSemaphore;
  using H2DHandle = Host2DeviceSemaphore::DeviceHandle;
  using H2HSemaphore = Host2HostSemaphore;
  using MemD2DSemaphore = MemoryDevice2DeviceSemaphore;
  using MemD2DHandle = MemoryDevice2DeviceSemaphore::DeviceHandle;

  // ---- Host2DeviceSemaphore -------------------------------------------------
  nb::class_<H2DSemaphore> h2dClass(m, "CppHost2DeviceSemaphore");
  h2dClass.def(nb::init<const Semaphore&>(), nb::arg("semaphore"));
  h2dClass.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"));
  h2dClass.def("connection", &H2DSemaphore::connection);
  h2dClass.def("signal", &H2DSemaphore::signal);
  h2dClass.def("device_handle", &H2DSemaphore::deviceHandle);

  // Nested handle type, scoped under the Python class registered just above.
  nb::class_<H2DHandle> h2dHandleClass(h2dClass, "DeviceHandle");
  h2dHandleClass.def(nb::init<>());
  h2dHandleClass.def_rw("inbound_token", &H2DHandle::inboundToken);
  h2dHandleClass.def_rw("expected_inbound_token", &H2DHandle::expectedInboundToken);
  // Read-only `raw`: a bytes object built from the handle's in-memory
  // representation (sizeof(H2DHandle) bytes starting at the handle's address).
  h2dHandleClass.def_prop_ro("raw", [](const H2DHandle& self) -> nb::bytes {
    return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(H2DHandle));
  });

  // ---- Host2HostSemaphore ---------------------------------------------------
  nb::class_<H2HSemaphore> h2hClass(m, "CppHost2HostSemaphore");
  h2hClass.def(nb::init<const Semaphore&>(), nb::arg("semaphore"));
  h2hClass.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"));
  h2hClass.def("connection", &H2HSemaphore::connection);
  h2hClass.def("signal", &H2HSemaphore::signal);
  h2hClass.def("poll", &H2HSemaphore::poll);
  // `wait` is called with the GIL released; `max_spin_count` defaults to 10000000.
  h2hClass.def("wait", &H2HSemaphore::wait, nb::call_guard<nb::gil_scoped_release>(),
               nb::arg("max_spin_count") = 10000000);

  // ---- MemoryDevice2DeviceSemaphore -----------------------------------------
  nb::class_<MemD2DSemaphore> memD2dClass(m, "CppMemoryDevice2DeviceSemaphore");
  memD2dClass.def(nb::init<const Semaphore&>(), nb::arg("semaphore"));
  memD2dClass.def(nb::init<Communicator&, const Connection&>(), nb::arg("communicator"), nb::arg("connection"));
  memD2dClass.def("connection", &MemD2DSemaphore::connection);
  memD2dClass.def("device_handle", &MemD2DSemaphore::deviceHandle);

  // Nested handle type, scoped under the Python class registered just above.
  nb::class_<MemD2DHandle> memD2dHandleClass(memD2dClass, "DeviceHandle");
  memD2dHandleClass.def(nb::init<>());
  memD2dHandleClass.def_rw("inbound_token", &MemD2DHandle::inboundToken);
  memD2dHandleClass.def_rw("remote_inbound_token", &MemD2DHandle::remoteInboundToken);
  memD2dHandleClass.def_rw("expected_inbound_token", &MemD2DHandle::expectedInboundToken);
  // Read-only `raw`: a bytes object built from the handle's in-memory
  // representation (sizeof(MemD2DHandle) bytes starting at the handle's address).
  memD2dHandleClass.def_prop_ro("raw", [](const MemD2DHandle& self) -> nb::bytes {
    return nb::bytes(reinterpret_cast<const char*>(&self), sizeof(MemD2DHandle));
  });
}
|