mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-24 14:54:51 +00:00
Reset GPU tokens before reuse (#795)
Fixes a token-reuse bug in `TokenPool` that's independent of MNNVL. ## Bug `TokenPool` hands out 8-byte device-memory slots used as device-semaphore counters. The deleter only cleared the bitmap — the underlying GPU memory was left as-is. When a token was freed and later re-allocated, the new semaphore inherited the previous counter value instead of starting at 0, breaking subsequent `signal()/wait()` math. ## Fix * Add a synchronous `gpuMemset` host helper (mirrors `gpuMemcpy` / `gpuMemcpyAsync`). * Zero the slot inside the `TokenPool` deleter so recycled tokens hand out a clean counter. The very-first allocation is already zeroed by `gpuCallocPhysical` (`src/core/gpu_utils.cc:227-228`), so first-time tokens are also clean — the deleter only has to handle the recycle case. ## Notes * Public wrapper is named `mscclpp::gpuMemset` (not `mscclpp::memset`) for symmetry with `gpuMemcpy` and to avoid shadowing `std::memset` in TUs that pull the namespace in. * Zeroing happens on release rather than acquire so the cost is paid in the typically less perf-sensitive teardown path. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void gpuMemset(void* ptr, int value, size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
bool isNvlsSupported() {
|
||||
|
||||
@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
|
||||
|
||||
std::shared_ptr<uint64_t> TokenPool::getToken() {
|
||||
auto deleter = [self = shared_from_this()](uint64_t* token) {
|
||||
// Zero the slot on release so the next allocator hands out a clean
|
||||
// semaphore counter (matches a freshly-allocated slot).
|
||||
mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
|
||||
size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
|
||||
size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
|
||||
uint64_t mask = 1UL << bit;
|
||||
|
||||
Reference in New Issue
Block a user