From b5a48f836cab9d72941836a85a18ab586c251e3b Mon Sep 17 00:00:00 2001
From: Ziyue Yang <ziyyang@microsoft.com>
Date: Tue, 2 Jul 2024 19:36:48 +0800
Subject: [PATCH] Separate NPKit CPU timestamp access from different blocks for
 AMD platform (#321)

Reference: https://github.com/ROCm/rccl/pull/1229
---
 include/mscclpp/npkit/npkit.hpp  | 10 ++++++++++
 src/executor/executor.cc         |  8 ++++++++
 src/include/execution_kernel.hpp |  7 ++++++-
 src/npkit/npkit.cc               | 24 +++++++++++++++++++++++-
 4 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/include/mscclpp/npkit/npkit.hpp b/include/mscclpp/npkit/npkit.hpp
index 77805cf6..24caf360 100644
--- a/include/mscclpp/npkit/npkit.hpp
+++ b/include/mscclpp/npkit/npkit.hpp
@@ -14,6 +14,12 @@
 
 #if defined(__HIP_PLATFORM_AMD__)
 #define NPKIT_GET_GPU_TIMESTAMP wall_clock64
+#define NPKIT_MAX_NUM_GPU_THREADBLOCKS 64
+#define NPKIT_CPU_TIMESTAMP_SLOT_SIZE 128
+#define NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(buf, blk) *(buf + NPKIT_CPU_TIMESTAMP_SLOT_SIZE * blk / sizeof(uint64_t))
+#define NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(buf, val, blk) \
+  *reinterpret_cast<volatile uint64_t*>(buf + NPKIT_CPU_TIMESTAMP_SLOT_SIZE * blk / sizeof(uint64_t)) = val
+
 #else
 #define NPKIT_GET_GPU_TIMESTAMP clock64
 #endif
@@ -85,7 +91,11 @@ class NpKit {
 
   static uint64_t rank_;
 
+#if defined(__HIP_PLATFORM_AMD__)
+  static mscclpp::UniqueCudaHostPtr<uint64_t[]> cpu_timestamp_;
+#else
   static mscclpp::UniqueCudaHostPtr<uint64_t> cpu_timestamp_;
+#endif
   static std::unique_ptr<std::thread> cpu_timestamp_update_thread_;
   static volatile bool cpu_timestamp_update_thread_should_stop_;
 };
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
index 0d5e75f0..62d749d0 100644
--- a/src/executor/executor.cc
+++ b/src/executor/executor.cc
@@ -262,6 +262,14 @@ struct Executor::Impl {
     static uint32_t flag = 0;
     int nthreadblocks = context.deviceExecutionPlans.size();
 #if defined(ENABLE_NPKIT)
+#if defined(__HIP_PLATFORM_AMD__)
+    if (nthreadblocks > NPKIT_MAX_NUM_GPU_THREADBLOCKS) {
+      throw Error("Executor plan launching " + std::to_string(nthreadblocks) +
+                      " thread blocks, exceeding NPKit support (" + std::to_string(NPKIT_MAX_NUM_GPU_THREADBLOCKS) +
+                      ")",
+                  ErrorCode::ExecutorError);
+    }
+#endif
     size_t sharedMemSize = sizeof(DeviceExecutionPlan) + NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
 #else
     size_t sharedMemSize = sizeof(DeviceExecutionPlan);
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
index 124aef72..e781daa3 100644
--- a/src/include/execution_kernel.hpp
+++ b/src/include/execution_kernel.hpp
@@ -364,7 +364,12 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
   DeviceHandle<SimpleProxyChannel>* proxyChannels = localPlan->channels.proxyChannels;
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
-  NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp, event_buffer, &event_buffer_head);
+#if defined(MSCCLPP_DEVICE_HIP)
+  NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, bid),
+#else
+  NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
+#endif
+                            event_buffer, &event_buffer_head);
 #endif
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
diff --git a/src/npkit/npkit.cc b/src/npkit/npkit.cc
index a0fe2938..77db6b90 100644
--- a/src/npkit/npkit.cc
+++ b/src/npkit/npkit.cc
@@ -18,7 +18,11 @@ std::vector<std::unique_ptr<NpKitEvent[]>> NpKit::cpu_event_buffers_;
 mscclpp::UniqueCudaPtr<NpKitEventCollectContext> NpKit::gpu_collect_contexts_;
 std::unique_ptr<NpKitEventCollectContext[]> NpKit::cpu_collect_contexts_;
 
+#if defined(__HIP_PLATFORM_AMD__)
+mscclpp::UniqueCudaHostPtr<uint64_t[]> NpKit::cpu_timestamp_;
+#else
 mscclpp::UniqueCudaHostPtr<uint64_t> NpKit::cpu_timestamp_;
+#endif
 std::unique_ptr<std::thread> NpKit::cpu_timestamp_update_thread_;
 volatile bool NpKit::cpu_timestamp_update_thread_should_stop_ = false;
 
@@ -26,10 +30,18 @@ void NpKit::CpuTimestampUpdateThread() {
   uint64_t init_system_clock = std::chrono::system_clock::now().time_since_epoch().count();
   uint64_t init_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
   uint64_t curr_steady_clock = 0;
-  volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_.get();
   while (!cpu_timestamp_update_thread_should_stop_) {
+#if defined(__HIP_PLATFORM_AMD__)
+    for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) {
+      curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
+      NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(),
+                                          init_system_clock + (curr_steady_clock - init_steady_clock), i);
+    }
+#else
     curr_steady_clock = std::chrono::steady_clock::now().time_since_epoch().count();
+    volatile uint64_t* volatile_cpu_timestamp_ = cpu_timestamp_.get();
     *volatile_cpu_timestamp_ = init_system_clock + (curr_steady_clock - init_steady_clock);
+#endif
   }
 }
 
@@ -55,10 +67,20 @@ void NpKit::Init(int rank) {
     cpu_collect_contexts_[i] = ctx;
   }
 
+#if defined(__HIP_PLATFORM_AMD__)
+  // Init timestamp. Allocates MAXCHANNELS*128 bytes buffer for GPU
+  cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t[]>(NPKIT_MAX_NUM_GPU_THREADBLOCKS *
+                                                           NPKIT_CPU_TIMESTAMP_SLOT_SIZE / sizeof(uint64_t));
+  for (int i = 0; i < NPKIT_MAX_NUM_GPU_THREADBLOCKS; i++) {
+    NPKIT_STORE_CPU_TIMESTAMP_PER_BLOCK(cpu_timestamp_.get(),
+                                        std::chrono::system_clock::now().time_since_epoch().count(), i);
+  }
+#else
   // Init timestamp
   cpu_timestamp_ = mscclpp::makeUniqueCudaHost<uint64_t>();
   volatile uint64_t* volatile_cpu_timestamp = cpu_timestamp_.get();
   *volatile_cpu_timestamp = std::chrono::system_clock::now().time_since_epoch().count();
+#endif
   cpu_timestamp_update_thread_should_stop_ = false;
   cpu_timestamp_update_thread_ = std::make_unique<std::thread>(CpuTimestampUpdateThread);
 #else