diff --git a/include/mscclpp/semaphore_device.hpp b/include/mscclpp/semaphore_device.hpp
index bd47ced2..f1b01e89 100644
--- a/include/mscclpp/semaphore_device.hpp
+++ b/include/mscclpp/semaphore_device.hpp
@@ -77,7 +77,7 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
   /// Relaxed wait; no memory completion guarantee. Use it only for synchronizing execution, not data.
   MSCCLPP_DEVICE_INLINE void relaxedWait([[maybe_unused]] int64_t maxSpinCount = 100000000) {
     auto expected = incExpectedInbound();
-    POLL_MAYBE_JAILBREAK((loadInbound() < expected), maxSpinCount);
+    POLL_MAYBE_JAILBREAK((loadInboundRelaxed() < expected), maxSpinCount);
   }
 
   /// Signal remote device, ensures prior memory ops complete.
@@ -115,6 +115,12 @@ struct MemoryDevice2DeviceSemaphoreDeviceHandle {
     return atomicLoad(inboundToken, memoryOrderAcquire);
   }
 
+  /// Thread-safe read of inbound value with relaxed ordering (no memory completion guarantee).
+  /// @return The inbound value.
+  MSCCLPP_DEVICE_INLINE uint64_t loadInboundRelaxed() {
+    return atomicLoad(inboundToken, memoryOrderRelaxed);
+  }
+
   /// Thread-safe read of outbound value.
   /// @return The outbound value.
   MSCCLPP_DEVICE_INLINE uint64_t loadOutbound() {