From 21e65bbb22a31bddd2067b5ae18060d44b8f0539 Mon Sep 17 00:00:00 2001
From: AviralGoelAMD
Date: Wed, 15 Oct 2025 02:39:04 +0000
Subject: [PATCH] docs: add inline comments about flush_cache and rotating buffer

[ROCm/composable_kernel commit: b03764ca5a917752845ddbb5da8886051a16d9be]
---
 include/ck_tile/host/flush_icache.hpp     |  6 ++++
 include/ck_tile/host/rotating_buffers.hpp | 41 ++++++++++++++++++-----
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/ck_tile/host/flush_icache.hpp b/include/ck_tile/host/flush_icache.hpp
index 9230b50a13..f4852252be 100644
--- a/include/ck_tile/host/flush_icache.hpp
+++ b/include/ck_tile/host/flush_icache.hpp
@@ -6,6 +6,12 @@
 #include <hip/hip_runtime.h>
 
 namespace ck_tile {
+// GPU kernel to invalidate the instruction cache for accurate benchmarking.
+// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
+//               forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
+// 16x s_nop:    Wait cycles (~16 cycles) to ensure the cache invalidation completes before the
+//               kernel exits. Without these NOPs, the flush may not finish, leading to
+//               inconsistent timing measurements where some instructions remain cached.
 static __global__ void flush_cache()
 {
     asm __volatile__("s_icache_inv \n\t"
diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp
index 86f68ad084..154d67fb8e 100644
--- a/include/ck_tile/host/rotating_buffers.hpp
+++ b/include/ck_tile/host/rotating_buffers.hpp
@@ -9,6 +9,20 @@
 
 namespace ck_tile {
 
+// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
+//
+// Purpose:
+//   When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
+//   will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
+//   timing measurements. This wrapper rotates through multiple copies of buffers at different
+//   memory addresses to force cache misses.
+//
+// How it works:
+//   Constructor: Creates rotating_count - 1 extra copies of matrices A and B in GPU memory
+//   Next():      Switches pointers to the next buffer copy (cycles through all copies)
+//   Destructor:  Frees extra buffer copies and restores original pointers
+//
+// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
 template <typename ADataType, typename BDataType>
 struct RotatingMemWrapper
 {
@@ -24,15 +38,18 @@ struct RotatingMemWrapper
           size_a(size_a_),
           size_b(size_b_)
     {
+        // Store original buffer pointers as first entry
        p_a_grids.push_back(a_ptr);
        p_b_grids.push_back(b_ptr);
+
+        // Create (rotating_count - 1) additional copies at different memory addresses
        for(size_t i = 1; i < rotating_count; i++)
        {
            {
                void* pADeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
-                                          const_cast<void*>(p_a_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),     // target buffer
+                                          const_cast<void*>(p_a_grids[0]),     // source buffer
                                           size_a_,
                                           hipMemcpyDeviceToDevice));
                p_a_grids.push_back(pADeviceBuf);
@@ -41,19 +58,21 @@
            {
                void* pBDeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
-                                          const_cast<void*>(p_b_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),     // target buffer
+                                          const_cast<void*>(p_b_grids[0]),     // source buffer
                                           size_b_,
                                           hipMemcpyDeviceToDevice));
                p_b_grids.push_back(pBDeviceBuf);
            }
        }
    }
+    // Rotate to the next buffer copy. Call this before each kernel run to use different
+    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
    void Next()
    {
        if(rotating_count > 1)
        {
-            std::size_t idx = iter++ % rotating_count;
+            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
            a_ptr = p_a_grids[idx];
            b_ptr = p_b_grids[idx];
        }
@@ -63,15 +82,16 @@
        std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                  << ", rotating_count: " << rotating_count << "}" << std::endl;
    }
+    // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
    ~RotatingMemWrapper() noexcept
    {
        if(rotating_count > 1)
        {
-            // restore ptr
+            // Restore original buffer pointers
            a_ptr = p_a_grids[0];
            b_ptr = p_b_grids[0];
 
-            // free device mem
+            // Free extra buffer copies (index 0 is the original, don't free it)
            for(size_t i = 1; i < rotating_count; i++)
            {
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
@@ -94,7 +114,12 @@ inline void flush_icache()
 {
    hipDeviceProp_t deviceProps;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-    int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+
+    // Over-provision blocks to ensure all CUs execute the flush instruction.
+    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
+    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
+    constexpr int32_t blocks_per_cu = 60;
+    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
 
    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
    HIP_CHECK_ERROR(hipGetLastError());
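
Note (illustrative, not part of the patch): the comments above describe how flush_icache()
and RotatingMemWrapper combine in a benchmark loop. The sketch below shows that pattern,
assuming the constructor signature implied by the hunks (a_ptr, b_ptr, rotating_count,
size_a, size_b, with the pointers taken by reference so Next() can rewrite them) and float
data types; benchmark_cold_cache, the iteration count, and the commented-out kernel launch
are hypothetical placeholders, not composable_kernel API.

    #include <cstddef>
    #include <hip/hip_runtime.h>
    #include "ck_tile/host/flush_icache.hpp"
    #include "ck_tile/host/rotating_buffers.hpp"

    // Hypothetical cold-cache benchmark loop for a kernel reading A and B.
    void benchmark_cold_cache(void* a_dev, void* b_dev, std::size_t size_a, std::size_t size_b)
    {
        const void* a_ptr = a_dev;
        const void* b_ptr = b_dev;

        // Keep 5 rotating copies of A and B (the original plus 4 duplicates),
        // so consecutive runs touch different addresses and miss in L2.
        ck_tile::RotatingMemWrapper<float, float> rotating_mem(a_ptr, b_ptr, 5, size_a, size_b);
        rotating_mem.Print();

        for(int i = 0; i < 10; i++)
        {
            ck_tile::flush_icache(); // invalidate the instruction cache on every CU
            rotating_mem.Next();     // rewrite a_ptr/b_ptr to point at the next buffer copy
            // run_kernel_under_test(a_ptr, b_ptr); // placeholder: launch and time the kernel
        }
        // ~RotatingMemWrapper frees the 4 duplicates and restores the original pointers.
    }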