From 21e65bbb22a31bddd2067b5ae18060d44b8f0539 Mon Sep 17 00:00:00 2001
From: AviralGoelAMD
Date: Wed, 15 Oct 2025 02:39:04 +0000
Subject: [PATCH] docs: add inline comments about flush_cache and rotating buffer

[ROCm/composable_kernel commit: b03764ca5a917752845ddbb5da8886051a16d9be]
---
 include/ck_tile/host/flush_icache.hpp     |  6 ++++
 include/ck_tile/host/rotating_buffers.hpp | 41 ++++++++++++++++++-----
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/include/ck_tile/host/flush_icache.hpp b/include/ck_tile/host/flush_icache.hpp
index 9230b50a13..f4852252be 100644
--- a/include/ck_tile/host/flush_icache.hpp
+++ b/include/ck_tile/host/flush_icache.hpp
@@ -6,6 +6,12 @@
 #include <hip/hip_runtime.h>
 
 namespace ck_tile {
+// GPU kernel to invalidate the instruction cache for accurate benchmarking.
+// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
+//               forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
+// 16x s_nop:    Wait cycles (~16 cycles) to ensure the cache invalidation completes before the
+//               kernel exits. Without these NOPs, the flush may not finish, leading to
+//               inconsistent timing measurements where some instructions remain cached.
 static __global__ void flush_cache()
 {
     asm __volatile__("s_icache_inv \n\t"
diff --git a/include/ck_tile/host/rotating_buffers.hpp b/include/ck_tile/host/rotating_buffers.hpp
index 86f68ad084..154d67fb8e 100644
--- a/include/ck_tile/host/rotating_buffers.hpp
+++ b/include/ck_tile/host/rotating_buffers.hpp
@@ -9,6 +9,20 @@
 
 namespace ck_tile {
 
+// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
+//
+// Purpose:
+//   When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
+//   will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
+//   timing measurements. This wrapper rotates through multiple copies of buffers at different
+//   memory addresses to force cache misses.
+//
+// How it works:
+//   Constructor: Creates rotating_count - 1 extra copies of matrices A and B in GPU memory
+//   Next():      Switches pointers to the next buffer copy (cycles through all copies)
+//   Destructor:  Frees extra buffer copies and restores original pointers
+//
+// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
 template <typename ADataType, typename BDataType>
 struct RotatingMemWrapper
 {
@@ -24,15 +38,18 @@ struct RotatingMemWrapper
           size_a(size_a_),
           size_b(size_b_)
     {
+        // Store original buffer pointers as first entry
        p_a_grids.push_back(a_ptr);
        p_b_grids.push_back(b_ptr);
+
+        // Create (rotating_count - 1) additional copies at different memory addresses
        for(size_t i = 1; i < rotating_count; i++)
        {
            {
                void* pADeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
-                                          const_cast<void*>(p_a_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),     // target buffer
+                                          const_cast<void*>(p_a_grids[0]),     // source buffer
                                           size_a_,
                                           hipMemcpyDeviceToDevice));
                p_a_grids.push_back(pADeviceBuf);
@@ -41,19 +58,21 @@
            {
                void* pBDeviceBuf;
                HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
-                                          const_cast<void*>(p_b_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),     // target buffer
+                                          const_cast<void*>(p_b_grids[0]),     // source buffer
                                           size_b_,
                                           hipMemcpyDeviceToDevice));
                p_b_grids.push_back(pBDeviceBuf);
            }
        }
    }
+    // Rotate to the next buffer copy. Call this before each kernel run to use different
+    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
    void Next()
    {
        if(rotating_count > 1)
        {
-            std::size_t idx = iter++ % rotating_count;
+            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
            a_ptr = p_a_grids[idx];
            b_ptr = p_b_grids[idx];
        }
@@ -63,15 +82,16 @@
        std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                  << ", rotating_count: " << rotating_count << "}" << std::endl;
    }
+    // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
    ~RotatingMemWrapper() noexcept
    {
        if(rotating_count > 1)
        {
-            // restore ptr
+            // Restore original buffer pointers
            a_ptr = p_a_grids[0];
            b_ptr = p_b_grids[0];
 
-            // free device mem
+            // Free extra buffer copies (index 0 is the original, don't free it)
            for(size_t i = 1; i < rotating_count; i++)
            {
                ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
@@ -94,7 +114,12 @@ inline void flush_icache()
 {
    hipDeviceProp_t deviceProps;
    HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-    int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+
+    // Over-provision blocks to ensure all CUs execute the flush instruction.
+    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
+    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
+    constexpr int32_t blocks_per_cu = 60;
+    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
 
    ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
    HIP_CHECK_ERROR(hipGetLastError());
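
Note (illustrative, not part of the patch): the comments above describe how flush_icache()
and RotatingMemWrapper combine in a benchmark loop. The sketch below shows that pattern,
assuming the constructor signature implied by the hunks (a_ptr, b_ptr, rotating_count,
size_a, size_b, with the pointers taken by reference so Next() can rewrite them) and float
data types; benchmark_cold_cache, the iteration count, and the commented-out kernel launch
are hypothetical placeholders, not composable_kernel API.

    #include <cstddef>
    #include <hip/hip_runtime.h>
    #include "ck_tile/host/flush_icache.hpp"
    #include "ck_tile/host/rotating_buffers.hpp"

    // Hypothetical cold-cache benchmark loop for a kernel reading A and B.
    void benchmark_cold_cache(void* a_dev, void* b_dev, std::size_t size_a, std::size_t size_b)
    {
        const void* a_ptr = a_dev;
        const void* b_ptr = b_dev;

        // Keep 5 rotating copies of A and B (the original plus 4 duplicates),
        // so consecutive runs touch different addresses and miss in L2.
        ck_tile::RotatingMemWrapper<float, float> rotating_mem(a_ptr, b_ptr, 5, size_a, size_b);
        rotating_mem.Print();

        for(int i = 0; i < 10; i++)
        {
            ck_tile::flush_icache(); // invalidate the instruction cache on every CU
            rotating_mem.Next();     // rewrite a_ptr/b_ptr to point at the next buffer copy
            // run_kernel_under_test(a_ptr, b_ptr); // placeholder: launch and time the kernel
        }
        // ~RotatingMemWrapper frees the 4 duplicates and restores the original pointers.
    }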