mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-12 17:26:04 +00:00
Fix & improve perf for ROCm (#232)
Co-authored-by: Binyang Li <binyli@microsoft.com>
This commit is contained in:
@@ -74,7 +74,7 @@ __device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyCh
|
||||
if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) {
|
||||
if ((threadIdx.x % 32) == 0) proxyChan.wait();
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
|
||||
// NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct.
|
||||
__syncthreads();
|
||||
#else
|
||||
|
||||
Reference in New Issue
Block a user