Fix & improve perf for ROCm (#232)

Co-authored-by: Binyang Li <binyli@microsoft.com>
This commit is contained in:
Changho Hwang
2023-12-18 11:30:08 +08:00
committed by GitHub
parent 5a9998bfba
commit 5ff8bc5ef2
9 changed files with 115 additions and 15 deletions

View File

@@ -74,7 +74,7 @@ __device__ void localAllGather(DeviceHandle<mscclpp::SimpleProxyChannel> proxyCh
if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) {
if ((threadIdx.x % 32) == 0) proxyChan.wait();
}
-#if defined(__HIP_PLATFORM_AMD__)
+#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
// NOTE: we actually need a group barrier here for better performance, but __syncthreads() is still correct.
__syncthreads();
#else