Fix & improve perf for ROCm (#232)

Co-authored-by: Binyang Li <binyli@microsoft.com>
This commit is contained in:
Changho Hwang
2023-12-18 11:30:08 +08:00
committed by GitHub
parent 5a9998bfba
commit 5ff8bc5ef2
9 changed files with 115 additions and 15 deletions

View File

@@ -4,20 +4,20 @@
#ifndef MSCCLPP_DEVICE_HPP_
#define MSCCLPP_DEVICE_HPP_
#if defined(__HIP_PLATFORM_AMD__)
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
#include <hip/hip_runtime.h>
#endif // defined(__HIP_PLATFORM_AMD__)
#endif // defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
#if (defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__))
#define MSCCLPP_DEVICE_COMPILE
#define MSCCLPP_DEVICE_INLINE __forceinline__ __device__
#define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__
#if defined(__HIP_PLATFORM_AMD__)
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
#define MSCCLPP_DEVICE_HIP
#else // !defined(__HIP_PLATFORM_AMD__)
#else // !(defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1))
#define MSCCLPP_DEVICE_CUDA
#endif // !defined(__HIP_PLATFORM_AMD__)
#endif // !(defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1))
#else // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__))

View File

@@ -70,9 +70,9 @@ struct FifoDeviceHandle {
#if defined(MSCCLPP_DEVICE_CUDA)
asm volatile("st.global.relaxed.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
#else // !defined(MSCCLPP_DEVICE_CUDA)
// TODO: both atomic and clang built-ins are buggy here
triggerPtr->fst = trigger.fst;
triggerPtr->snd = trigger.snd;
// store snd no later than fst.
atomicStore(&(triggerPtr->snd), trigger.snd, memoryOrderRelaxed);
atomicStore(&(triggerPtr->fst), trigger.fst, memoryOrderRelaxed);
#endif // !defined(MSCCLPP_DEVICE_CUDA)
return curFifoHead;

View File

@@ -4,7 +4,7 @@
#ifndef MSCCLPP_GPU_HPP_
#define MSCCLPP_GPU_HPP_
#if defined(__HIP_PLATFORM_AMD__)
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
#include <hip/hip_runtime.h>

View File

@@ -41,7 +41,6 @@ union alignas(16) LLPacket {
#else // !defined(MSCCLPP_DEVICE_CUDA)
uint4 reg = make_uint4(val1, flag, val2, flag);
ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg);
// TODO: clang built-ins are buggy here
atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);
#endif
@@ -65,7 +64,6 @@ union alignas(16) LLPacket {
return (flag1 != flag) || (flag2 != flag);
#else // !defined(MSCCLPP_DEVICE_CUDA)
ulonglong2 reg;
// TODO: clang built-ins are buggy here
reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed);
reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed);
uint4* ptr = reinterpret_cast<uint4*>(&reg);

View File

@@ -17,7 +17,7 @@ struct Timer {
~Timer();
/// Returns the elapsed time in milliseconds.
/// Returns the elapsed time in microseconds.
int64_t elapsed() const;
void set(int timeout);