From 9ec26fa4d11325ca33dd4dca83b99dee9146e6bf Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 4 May 2026 15:11:47 -0700 Subject: [PATCH] Reset GPU tokens before reuse (#795) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a token-reuse bug in `TokenPool` that's independent of MNNVL. ## Bug `TokenPool` hands out 8-byte device-memory slots used as device-semaphore counters. The deleter only cleared the bitmap — the underlying GPU memory was left as-is. When a token was freed and later re-allocated, the new semaphore inherited the previous counter value instead of starting at 0, breaking subsequent `signal()/wait()` math. ## Fix * Add a synchronous `gpuMemset` host helper (mirrors `gpuMemcpy` / `gpuMemcpyAsync`). * Zero the slot inside the `TokenPool` deleter so recycled tokens hand out a clean counter. The very-first allocation is already zeroed by `gpuCallocPhysical` (`src/core/gpu_utils.cc:227-228`), so first-time tokens are also clean — the deleter only has to handle the recycle case. ## Notes * Public wrapper is named `mscclpp::gpuMemset` (not `mscclpp::memset`) for symmetry with `gpuMemcpy` and to avoid shadowing `std::memset` in TUs that pull the namespace in. * Zeroing happens on release rather than acquire so the cost is paid in the typically less perf-sensitive teardown path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- include/mscclpp/gpu_utils.hpp | 7 +++++++ src/core/gpu_utils.cc | 7 +++++++ src/core/utils_internal.cc | 3 +++ 3 files changed, 17 insertions(+) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index ecd13c47..b079e0fd 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr); void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream, cudaMemcpyKind kind = cudaMemcpyDefault); void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault); +void gpuMemset(void* ptr, int value, size_t bytes); /// A template function that allocates memory while ensuring that the memory will be freed when the returned object is /// destroyed. @@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind); } +/// Sets `bytes` of memory at `ptr` to `value` synchronously. +/// @param ptr Destination address. +/// @param value Value to set (interpreted as unsigned char per CUDA semantics). +/// @param bytes Number of bytes to set. +inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); } + /// Check if NVLink SHARP (NVLS) is supported. /// /// @return True if NVLink SHARP (NVLS) is supported, false otherwise. diff --git a/src/core/gpu_utils.cc b/src/core/gpu_utils.cc index 09d5025d..1ce61322 100644 --- a/src/core/gpu_utils.cc +++ b/src/core/gpu_utils.cc @@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); } +void gpuMemset(void* ptr, int value, size_t bytes) { + AvoidCudaGraphCaptureGuard cgcGuard; + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); +} + } // namespace detail bool isNvlsSupported() { diff --git a/src/core/utils_internal.cc b/src/core/utils_internal.cc index 9504a52c..8cc55430 100644 --- a/src/core/utils_internal.cc +++ b/src/core/utils_internal.cc @@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) { std::shared_ptr TokenPool::getToken() { auto deleter = [self = shared_from_this()](uint64_t* token) { + // Zero the slot on release so the next allocator hands out a clean + // semaphore counter (matches a freshly-allocated slot). + mscclpp::gpuMemset(token, 0, sizeof(uint64_t)); size_t index = (token - self->baseAddr_) / UINT64_WIDTH; size_t bit = (token - self->baseAddr_) % UINT64_WIDTH; uint64_t mask = 1UL << bit;