From b5a48f836cab9d72941836a85a18ab586c251e3b Mon Sep 17 00:00:00 2001 From: Ziyue Yang Date: Tue, 2 Jul 2024 19:36:48 +0800 Subject: [PATCH] Separate NPKit CPU timestamp access from different blocks for AMD platform (#321) Reference: https://github.com/ROCm/rccl/pull/1229 --- include/mscclpp/npkit/npkit.hpp | 10 ++++++++++ src/executor/executor.cc | 8 ++++++++ src/include/execution_kernel.hpp | 7 ++++++- src/npkit/npkit.cc | 24 +++++++++++++++++++++++- 4 files changed, 47 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/npkit/npkit.hpp b/include/mscclpp/npkit/npkit.hpp index 77805cf6..24caf360 100644 --- a/include/mscclpp/npkit/npkit.hpp +++ b/include/mscclpp/npkit/npkit.hpp @@ -14,6 +14,12 @@ #if defined(__HIP_PLATFORM_AMD__) #define NPKIT_GET_GPU_TIMESTAMP wall_clock64 +#define NPKIT_MAX_NUM_GPU_THREADBLOCKS 64 +#define NPKIT_CPU_TIMESTAMP_SLOT_SIZE 128 +#define NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(buf, blk) *(buf + NPKIT_CPU_TIMESTAMP_SLOT_SIZE * blk / sizeof(uint64_t)) +#define NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(buf, val, blk) \ + *reinterpret_cast(buf + NPKIT_CPU_TIMESTAMP_SLOT_SIZE * blk / sizeof(uint64_t)) = val + #else #define NPKIT_GET_GPU_TIMESTAMP clock64 #endif @@ -85,7 +91,11 @@ class NpKit { static uint64_t rank_; +#if defined(__HIP_PLATFORM_AMD__) + static mscclpp::UniqueCudaHostPtr cpu_timestamp_; +#else static mscclpp::UniqueCudaHostPtr cpu_timestamp_; +#endif static std::unique_ptr cpu_timestamp_update_thread_; static volatile bool cpu_timestamp_update_thread_should_stop_; }; diff --git a/src/executor/executor.cc b/src/executor/executor.cc index 0d5e75f0..62d749d0 100644 --- a/src/executor/executor.cc +++ b/src/executor/executor.cc @@ -262,6 +262,14 @@ struct Executor::Impl { static uint32_t flag = 0; int nthreadblocks = context.deviceExecutionPlans.size(); #if defined(ENABLE_NPKIT) +#if defined(__HIP_PLATFORM_AMD__) + if (nthreadblocks > NPKIT_MAX_NUM_GPU_THREADBLOCKS) { + throw Error("Executor plan launching " + std::to_string(nthreadblocks) + + " thread blocks, exceeding NPKit support (" + std::to_string(NPKIT_MAX_NUM_GPU_THREADBLOCKS) + + ")", + ErrorCode::ExecutorError); + } +#endif size_t sharedMemSize = sizeof(DeviceExecutionPlan) + NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent); #else size_t sharedMemSize = sizeof(DeviceExecutionPlan); diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp index 124aef72..e781daa3 100644 --- a/src/include/execution_kernel.hpp +++ b/src/include/execution_kernel.hpp @@ -364,7 +364,12 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu DeviceHandle* proxyChannels = localPlan->channels.proxyChannels; #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU) - NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, event_buffer, &event_buffer_head); +#if defined(MSCCLPP_DEVICE_HIP) + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, bid), +#else + NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, +#endif + event_buffer, &event_buffer_head); #endif #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU) diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc index a0fe2938..77db6b90 100644 --- a/src/npkit/npkit.cc +++ b/src/npkit/npkit.cc @@ -18,7 +18,11 @@ std::vector> NpKit::cpu_event_buffers_; mscclpp::UniqueCudaPtr NpKit::gpu_collect_contexts_; std::unique_ptr NpKit::cpu_collect_contexts_; +#if defined(__HIP_PLATFORM_AMD__) +mscclpp::UniqueCudaHostPtr NpKit::cpu_timestamp_; +#else mscclpp::UniqueCudaHostPtr NpKit::cpu_timestamp_; +#endif std::unique_ptr NpKit::cpu_timestamp_update_thread_; volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false; @@ -26,10 +30,18 @@ void NpKit::CpuTimestampUpdateThread() { uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count(); uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count(); uint64_t curr_steady_clock = 0; - volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_.get(); while (!cpu_timestamp_update_thread_should_stop_) { +#if defined(__HIP_PLATFORM_AMD__) + for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) { + curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count(); + NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(), + init_system_clock + (curr_steady_clock - init_steady_clock), i); + } +#else curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count(); + volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_.get(); *volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock); +#endif } } @@ -55,10 +67,20 @@ void NpKit::Init(int rank) { cpu_collect_contexts_[i] = ctx; } +#if defined(__HIP_PLATFORM_AMD__) + // Init timestamp. Allocates MAXCHANNELS*128 bytes buffer for GPU + cpu_timestamp_ = mscclpp::makeUniqueCudaHost(NPKIT_MAX_NUM_GPU_THREADBLOCKS * + NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t)); + for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) { + NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(), + std::chrono::system_clock::now().time_since_epoch().count(), i); + } +#else // Init timestamp cpu_timestamp_ = mscclpp::makeUniqueCudaHost(); volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_.get(); *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count(); +#endif cpu_timestamp_update_thread_should_stop_ = false; cpu_timestamp_update_thread_ = std::make_unique(CpuTimestampUpdateThread); #else