mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 01:10:22 +00:00
Reset GPU tokens before reuse
Clear recycled TokenPool entries before handing them out so device-to-device semaphores start from a clean counter value.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
|
|||||||
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
|
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
|
||||||
cudaMemcpyKind kind = cudaMemcpyDefault);
|
cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||||
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
|
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||||
|
void gpuMemset(void* ptr, int value, size_t bytes);
|
||||||
|
|
||||||
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
|
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
|
||||||
/// destroyed.
|
/// destroyed.
|
||||||
@@ -300,6 +301,8 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
|
|||||||
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
|
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set a range of bytes to a given value.
///
/// Thin wrapper that forwards its arguments unchanged to detail::gpuMemset.
/// NOTE(review): the helper's name suggests @p ptr must refer to GPU memory; confirm against its definition.
///
/// @param ptr Start address of the memory to fill.
/// @param value Fill value, passed through as-is to the helper.
/// @param bytes Number of bytes to fill.
inline void memset(void* ptr, int value, size_t bytes) {
  detail::gpuMemset(ptr, value, bytes);
}
|
||||||
|
|
||||||
/// Check if NVLink SHARP (NVLS) is supported.
|
/// Check if NVLink SHARP (NVLS) is supported.
|
||||||
///
|
///
|
||||||
/// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
|
/// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
|
||||||
|
|||||||
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
|
|||||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Fill a memory range with a byte value and wait for the fill to finish.
///
/// The fill is issued with cudaMemsetAsync on a temporary stream created with cudaStreamNonBlocking, and that
/// stream is synchronized before this function returns.
///
/// @param ptr Start address of the memory to fill.
/// @param value Fill value; cudaMemsetAsync writes it byte-wise, so only patterns expressible as one repeated
/// byte (such as zero) can be produced.
/// @param bytes Number of bytes to fill. A zero-byte request returns immediately without touching CUDA.
/// NOTE(review): failures are reported through MSCCLPP_CUDATHROW, which presumably throws; confirm against the
/// macro definition.
void gpuMemset(void* ptr, int value, size_t bytes) {
  // Nothing to write: avoid creating and synchronizing a stream for a no-op.
  if (bytes == 0) return;
  // NOTE(review): presumably keeps this call from being recorded into an in-progress CUDA graph capture;
  // confirm against the guard's definition. It must be constructed before any stream work below.
  AvoidCudaGraphCaptureGuard cgcGuard;
  CudaStreamWithFlags stream(cudaStreamNonBlocking);
  MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
  // Block until the fill has completed so callers may use the memory right away.
  MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
}
|
||||||
|
|
||||||
} // namespace detail
|
} // namespace detail
|
||||||
|
|
||||||
bool isNvlsSupported() {
|
bool isNvlsSupported() {
|
||||||
|
|||||||
@@ -263,8 +263,10 @@ std::shared_ptr<uint64_t> TokenPool::getToken() {
|
|||||||
for (int bit = 0; bit < UINT64_WIDTH; bit++) {
|
for (int bit = 0; bit < UINT64_WIDTH; bit++) {
|
||||||
if (holes & (1UL << bit)) {
|
if (holes & (1UL << bit)) {
|
||||||
allocationMap_[i].set(bit);
|
allocationMap_[i].set(bit);
|
||||||
INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit);
|
uint64_t* token = baseAddr_ + i * UINT64_WIDTH + bit;
|
||||||
return std::shared_ptr<uint64_t>(baseAddr_ + i * UINT64_WIDTH + bit, deleter);
|
mscclpp::memset(token, 0, sizeof(uint64_t));
|
||||||
|
INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", token);
|
||||||
|
return std::shared_ptr<uint64_t>(token, deleter);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user