mirror of
https://github.com/microsoft/mscclpp.git
synced 2026-05-26 08:01:00 +00:00
Fix & improve perf for ROCm (#232)
Co-authored-by: Binyang Li <binyli@microsoft.com>
This commit is contained in:
@@ -4,20 +4,20 @@
|
||||
#ifndef MSCCLPP_DEVICE_HPP_
|
||||
#define MSCCLPP_DEVICE_HPP_
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
|
||||
#include <hip/hip_runtime.h>
|
||||
#endif // defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
|
||||
|
||||
#if (defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
#define MSCCLPP_DEVICE_COMPILE
|
||||
#define MSCCLPP_DEVICE_INLINE __forceinline__ __device__
|
||||
#define MSCCLPP_HOST_DEVICE_INLINE __forceinline__ __host__ __device__
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
|
||||
#define MSCCLPP_DEVICE_HIP
|
||||
#else // !defined(__HIP_PLATFORM_AMD__)
|
||||
#else // !(defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1))
|
||||
#define MSCCLPP_DEVICE_CUDA
|
||||
#endif // !defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // !(defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1))
|
||||
|
||||
#else // !(defined(__NVCC__) || defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
|
||||
@@ -70,9 +70,9 @@ struct FifoDeviceHandle {
|
||||
#if defined(MSCCLPP_DEVICE_CUDA)
|
||||
asm volatile("st.global.relaxed.sys.v2.u64 [%0], {%1,%2};" ::"l"(triggerPtr), "l"(trigger.fst), "l"(trigger.snd));
|
||||
#else // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
// TODO: both atomic and clang built-ins are buggy here
|
||||
triggerPtr->fst = trigger.fst;
|
||||
triggerPtr->snd = trigger.snd;
|
||||
// store snd no later than fst.
|
||||
atomicStore(&(triggerPtr->snd), trigger.snd, memoryOrderRelaxed);
|
||||
atomicStore(&(triggerPtr->fst), trigger.fst, memoryOrderRelaxed);
|
||||
#endif // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
|
||||
return curFifoHead;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#ifndef MSCCLPP_GPU_HPP_
|
||||
#define MSCCLPP_GPU_HPP_
|
||||
|
||||
#if defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__HIP_PLATFORM_AMD__) && (__HIP_PLATFORM_AMD__ == 1)
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
|
||||
@@ -41,7 +41,6 @@ union alignas(16) LLPacket {
|
||||
#else // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
uint4 reg = make_uint4(val1, flag, val2, flag);
|
||||
ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg);
|
||||
// TODO: clang built-ins are buggy here
|
||||
atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
|
||||
atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);
|
||||
#endif
|
||||
@@ -65,7 +64,6 @@ union alignas(16) LLPacket {
|
||||
return (flag1 != flag) || (flag2 != flag);
|
||||
#else // !defined(MSCCLPP_DEVICE_CUDA)
|
||||
ulonglong2 reg;
|
||||
// TODO: clang built-ins are buggy here
|
||||
reg.x = atomicLoad(&(raw_.x), memoryOrderRelaxed);
|
||||
reg.y = atomicLoad(&(raw_.y), memoryOrderRelaxed);
|
||||
uint4* ptr = reinterpret_cast<uint4*>(&reg);
|
||||
|
||||
@@ -17,7 +17,7 @@ struct Timer {
|
||||
|
||||
~Timer();
|
||||
|
||||
/// Returns the elapsed time in milliseconds.
|
||||
/// Returns the elapsed time in microseconds.
|
||||
int64_t elapsed() const;
|
||||
|
||||
void set(int timeout);
|
||||
|
||||
Reference in New Issue
Block a user