mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-03-24 09:07:39 +00:00
37 lines
1.4 KiB
C++
37 lines
1.4 KiB
C++
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
#pragma once
|
|
|
|
#include <hip/hip_runtime.h>
|
|
|
|
namespace ck_tile {
|
|
// GPU kernel to invalidate instruction cache for accurate benchmarking.
|
|
// s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit,
|
|
// forcing subsequent kernel runs to fetch instructions from HBM instead of cache.
|
|
// 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel
|
|
// exits. Without these NOPs, the flush may not finish, leading to inconsistent
|
|
// timing measurements where some instructions remain cached.
|
|
static __global__ void flush_cache()
|
|
{
|
|
asm __volatile__("s_icache_inv \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t"
|
|
"s_nop 0 \n\t" ::
|
|
:);
|
|
}
|
|
} // namespace ck_tile
|