mirror of https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 02:54:21 +00:00

docs: add inline comments about flush_cache and rotating buffer

commit 21e65bbb22 (upstream ROCm/composable_kernel commit b03764ca5a)
parent 89fb435ce2
committed by Aviral Goel
@@ -6,6 +6,12 @@
 #include <hip/hip_runtime.h>
 
 namespace ck_tile {
+// GPU kernel to invalidate instruction cache for accurate benchmarking.
+// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
+//               forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
+// 16x s_nop:    Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
+//               exits. Without these NOPs, the flush may not finish, leading to inconsistent
+//               timing measurements where some instructions remain cached.
 static __global__ void flush_cache()
 {
     asm __volatile__("s_icache_inv \n\t"
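The diff cuts off inside the asm body; per the new comment, the remainder is presumably the sixteen s_nop wait cycles. A minimal self-contained sketch of a flush kernel along those lines (an illustration of the technique, not the committed code):

#include <hip/hip_runtime.h>

// Sketch: invalidate the L1 instruction cache, then idle long enough for the
// invalidation to complete before the wavefront retires.
static __global__ void flush_cache_sketch()
{
    asm __volatile__("s_icache_inv \n\t"
                     "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t"
                     "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t"
                     "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t"
                     "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" ::);
}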
@@ -9,6 +9,20 @@
 
 namespace ck_tile {
 
+// RotatingMemWrapper: Prevents GPU data cache reuse during kernel benchmarking.
+//
+// Purpose:
+//   When benchmarking a kernel repeatedly with the same input buffers, the GPU L2 cache
+//   will serve data from cache (hot) instead of HBM (cold), leading to artificially fast
+//   timing measurements. This wrapper rotates through multiple copies of buffers at different
+//   memory addresses to force cache misses.
+//
+// How it works:
+//   Constructor: Creates rotating_count copies of matrices A and B in GPU memory
+//   Next():      Switches pointers to the next buffer copy (cycles through all copies)
+//   Destructor:  Frees extra buffer copies and restores original pointers
+//
+// Combined with flush_icache(), this ensures realistic "cold cache" performance measurements.
 template <typename ADataType, typename BDataType>
 struct RotatingMemWrapper
 {
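The constructor and call-site signatures are not visible in this diff, so the following is only a hypothetical mini version of the same rotation idea, with all names invented here (RotatingBufferSketch is not the ck_tile class): keep several device copies of a buffer and hand out a different one each run, so consecutive runs touch different addresses.

#include <hip/hip_runtime.h>
#include <cstddef>
#include <vector>

// Hypothetical sketch of the rotation technique (names are assumptions).
struct RotatingBufferSketch
{
    std::vector<void*> copies; // copies[0] is the caller's original buffer
    std::size_t iter = 0;

    RotatingBufferSketch(void* original, std::size_t bytes, std::size_t count)
    {
        copies.push_back(original);
        for(std::size_t i = 1; i < count; ++i)
        {
            void* p = nullptr;
            (void)hipMalloc(&p, bytes);                                   // new address
            (void)hipMemcpy(p, original, bytes, hipMemcpyDeviceToDevice); // same contents
            copies.push_back(p);
        }
    }

    // Call before each timed kernel run, as the wrapper's Next() comment describes.
    void* Next() { return copies[iter++ % copies.size()]; }

    ~RotatingBufferSketch()
    {
        for(std::size_t i = 1; i < copies.size(); ++i)
            (void)hipFree(copies[i]); // index 0 belongs to the caller
    }
};

Because every copy holds identical data, rotating addresses changes cache behavior without changing the kernel's results.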
@@ -24,15 +38,18 @@ struct RotatingMemWrapper
           size_a(size_a_),
           size_b(size_b_)
     {
+        // Store original buffer pointers as first entry
         p_a_grids.push_back(a_ptr);
         p_b_grids.push_back(b_ptr);
+
+        // Create (rotating_count - 1) additional copies at different memory addresses
         for(size_t i = 1; i < rotating_count; i++)
         {
             {
                 void* pADeviceBuf;
                 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pADeviceBuf), size_a_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf),
-                                          const_cast<void*>(p_a_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pADeviceBuf), // target buffer
+                                          const_cast<void*>(p_a_grids[0]), // source buffer
                                           size_a_,
                                           hipMemcpyDeviceToDevice));
                 p_a_grids.push_back(pADeviceBuf);
@@ -41,19 +58,21 @@ struct RotatingMemWrapper
             {
                 void* pBDeviceBuf;
                 HIP_CHECK_ERROR(hipMalloc(static_cast<void**>(&pBDeviceBuf), size_b_));
-                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf),
-                                          const_cast<void*>(p_b_grids[0]),
+                HIP_CHECK_ERROR(hipMemcpy(static_cast<void*>(pBDeviceBuf), // target buffer
+                                          const_cast<void*>(p_b_grids[0]), // source buffer
                                           size_b_,
                                           hipMemcpyDeviceToDevice));
                 p_b_grids.push_back(pBDeviceBuf);
             }
         }
     }
 
+    // Rotate to the next buffer copy. Call this before each kernel run to use different
+    // memory addresses, forcing the GPU to fetch data from HBM instead of cache.
     void Next()
     {
         if(rotating_count > 1)
         {
-            std::size_t idx = iter++ % rotating_count;
+            std::size_t idx = iter++ % rotating_count; // Cycle through all buffer copies
             a_ptr = p_a_grids[idx];
             b_ptr = p_b_grids[idx];
         }
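As a concrete check of the cycling comment: with rotating_count = 3, the idx expression produces the index sequence 0, 1, 2, 0, 1, 2, ... A standalone illustration (not code from this commit):

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t iter = 0;
    const std::size_t rotating_count = 3;
    for(int run = 0; run < 6; ++run)
    {
        std::size_t idx = iter++ % rotating_count; // same expression as in Next()
        std::printf("run %d -> buffer copy %zu\n", run, idx);
    }
    return 0; // prints copies 0,1,2,0,1,2: each consecutive run uses a new address
}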
@@ -63,15 +82,16 @@ struct RotatingMemWrapper
         std::cout << "RotatingMemWrapper: { size_a: " << size_a << ", size_b: " << size_b
                   << ", rotating_count: " << rotating_count << "}" << std::endl;
     }
+    // Cleanup: Free all extra buffer copies (keeping original) and restore original pointers
     ~RotatingMemWrapper() noexcept
     {
         if(rotating_count > 1)
         {
-            // restore ptr
+            // Restore original buffer pointers
             a_ptr = p_a_grids[0];
             b_ptr = p_b_grids[0];
 
-            // free device mem
+            // Free extra buffer copies (index 0 is the original, don't free it)
             for(size_t i = 1; i < rotating_count; i++)
             {
                 ck_tile::hip_check_error(hipFree(const_cast<void*>(p_a_grids[i])));
@@ -94,7 +114,12 @@ inline void flush_icache()
 {
     hipDeviceProp_t deviceProps;
     HIP_CHECK_ERROR(hipGetDeviceProperties(&deviceProps, 0));
-    int32_t gpu_block3 = deviceProps.multiProcessorCount * 60;
+
+    // Over-provision blocks to ensure all CUs execute the flush instruction.
+    // With imperfect scheduling, launching exactly 1 block per CU doesn't guarantee coverage.
+    // 60x over-provisioning provides statistical certainty that every CU gets at least one block.
+    constexpr int32_t blocks_per_cu = 60;
+    int32_t gpu_block3 = deviceProps.multiProcessorCount * blocks_per_cu;
 
     ck_tile::flush_cache<<<dim3(gpu_block3), dim3(64), 0, nullptr>>>();
     HIP_CHECK_ERROR(hipGetLastError());
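To make the over-provisioning arithmetic concrete: a GPU with 304 CUs (an MI300X, for example) gets 304 * 60 = 18240 blocks of 64 threads, so even with uneven block scheduling every CU almost certainly executes the flush at least once. A standalone sketch of the same launch logic, where flush_kernel is a placeholder rather than the library's flush_cache:

#include <hip/hip_runtime.h>
#include <cstdint>

__global__ void flush_kernel() { asm __volatile__("s_icache_inv \n\t" ::); }

int main()
{
    hipDeviceProp_t props;
    if(hipGetDeviceProperties(&props, 0) != hipSuccess) return 1;

    // Launch far more blocks than CUs so the scheduler places work on every CU.
    constexpr std::int32_t blocks_per_cu = 60;
    const std::int32_t grid = props.multiProcessorCount * blocks_per_cu;

    flush_kernel<<<dim3(grid), dim3(64), 0, nullptr>>>();
    if(hipGetLastError() != hipSuccess) return 1;
    return hipDeviceSynchronize() == hipSuccess ? 0 : 1;
}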