mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-11 08:50:21 +00:00
Merge branch 'main' into caiorocha/4_nodes_allreduce
This commit is contained in:
@@ -165,6 +165,7 @@ void gpuFreePhysical(void* ptr);
|
||||
void gpuMemcpyAsync(void* dst, const void* src, size_t bytes, cudaStream_t stream,
|
||||
cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind = cudaMemcpyDefault);
|
||||
void gpuMemset(void* ptr, int value, size_t bytes);
|
||||
|
||||
/// A template function that allocates memory while ensuring that the memory will be freed when the returned object is
|
||||
/// destroyed.
|
||||
@@ -300,6 +301,12 @@ void gpuMemcpy(T* dst, const T* src, size_t nelems, cudaMemcpyKind kind = cudaMe
|
||||
detail::gpuMemcpy(dst, src, nelems * sizeof(T), kind);
|
||||
}
|
||||
|
||||
/// Sets `bytes` of memory at `ptr` to `value` synchronously.
|
||||
/// @param ptr Destination address.
|
||||
/// @param value Value to set (interpreted as unsigned char per CUDA semantics).
|
||||
/// @param bytes Number of bytes to set.
|
||||
inline void gpuMemset(void* ptr, int value, size_t bytes) { detail::gpuMemset(ptr, value, bytes); }
|
||||
|
||||
/// Check if NVLink SHARP (NVLS) is supported.
|
||||
///
|
||||
/// @return True if NVLink SHARP (NVLS) is supported, false otherwise.
|
||||
|
||||
@@ -267,6 +267,13 @@ void gpuMemcpy(void* dst, const void* src, size_t bytes, cudaMemcpyKind kind) {
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
void gpuMemset(void* ptr, int value, size_t bytes) {
|
||||
AvoidCudaGraphCaptureGuard cgcGuard;
|
||||
CudaStreamWithFlags stream(cudaStreamNonBlocking);
|
||||
MSCCLPP_CUDATHROW(cudaMemsetAsync(ptr, value, bytes, stream));
|
||||
MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream));
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
bool isNvlsSupported() {
|
||||
|
||||
@@ -248,6 +248,9 @@ TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
|
||||
|
||||
std::shared_ptr<uint64_t> TokenPool::getToken() {
|
||||
auto deleter = [self = shared_from_this()](uint64_t* token) {
|
||||
// Zero the slot on release so the next allocator hands out a clean
|
||||
// semaphore counter (matches a freshly-allocated slot).
|
||||
mscclpp::gpuMemset(token, 0, sizeof(uint64_t));
|
||||
size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
|
||||
size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
|
||||
uint64_t mask = 1UL << bit;
|
||||
|
||||
Reference in New Issue
Block a user