Reset GPU tokens before reuse (#795)

Fixes a token-reuse bug in `TokenPool` that's independent of MNNVL.

## Bug

`TokenPool` hands out 8-byte device-memory slots used as
device-semaphore counters. The deleter only cleared the bitmap — the
underlying GPU memory was left as-is. When a token was freed and later
re-allocated, the new semaphore inherited the previous counter value
instead of starting at 0, breaking subsequent `signal()/wait()` math.

## Fix

* Add a synchronous `gpuMemset` host helper (mirrors `gpuMemcpy` /
`gpuMemcpyAsync`).
* Zero the slot inside the `TokenPool` deleter so a recycled token
starts from a clean counter. The very-first allocation is already zeroed
by `gpuCallocPhysical` (`src/core/gpu_utils.cc:227-228`), so first-time
tokens are also clean — the deleter only has to handle the recycle case.

## Notes

* Public wrapper is named `mscclpp::gpuMemset` (not `mscclpp::memset`)
for symmetry with `gpuMemcpy` and to avoid shadowing `std::memset` in
TUs that pull the namespace in.
* Zeroing happens on release rather than acquire so the cost is paid in
the typically less perf-sensitive teardown path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Binyang Li
2026-05-04 15:11:47 -07:00
committed by GitHub
parent 2c52937b26
commit 9ec26fa4d1
3 changed files with 17 additions and 0 deletions

View File

@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
cudaMemcpyKind kind = cudaMemcpyDefault);
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
void gpuMemset(void* ptr, int value, size_t bytes);
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
/// destroyed.
@@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
}
/// Synchronously fill `bytes` bytes of memory starting at `ptr` with `value`.
/// Thin public wrapper over detail::gpuMemset, mirroring gpuMemcpy/gpuMemcpyAsync.
/// @param ptr Destination address.
/// @param value Fill value; CUDA uses only the low byte (unsigned char semantics).
/// @param bytes Number of bytes to set.
inline void gpuMemset(void* ptr, int value, size_t bytes) {
  detail::gpuMemset(ptr, value, bytes);
}
/// Check if NVLink SHARP (NVLS) is supported.
///
/// @return True if NVLink SHARP (NVLS) is supported, false otherwise.

View File

@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
// Synchronous memset: issues cudaMemsetAsync on a private non-blocking stream,
// then blocks until it completes (same pattern as gpuMemcpy above).
void gpuMemset(void* ptr, int value, size_t bytes) {
// Guard first so the memset is not recorded into an ongoing CUDA graph capture
// (cudaMemsetAsync on a capturing stream would be captured, not executed now).
AvoidCudaGraphCaptureGuard cgcGuard;
// Private non-blocking stream avoids implicit synchronization with the legacy
// default stream.
CudaStreamWithFlags stream(cudaStreamNonBlocking);
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
// Block until the fill is done so callers get fully synchronous semantics.
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
} // namespace detail
bool isNvlsSupported() {

View File

@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
std::shared_ptr<uint64_t> TokenPool::getToken() {
auto deleter = [self = shared_from_this()](uint64_t* token) {
// Zero the slot on release so the next allocator hands out a clean
// semaphore counter (matches a freshly-allocated slot).
mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
uint64_t mask = 1UL << bit;