// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT #pragma once #include namespace ck_tile { // GPU kernel to invalidate instruction cache for accurate benchmarking. // s_icache_inv: Asynchronously invalidates the L1 instruction cache on this compute unit, // forcing subsequent kernel runs to fetch instructions from HBM instead of cache. // 16x s_nop: Wait cycles (~16 cycles) to ensure cache invalidation completes before kernel // exits. Without these NOPs, the flush may not finish, leading to inconsistent // timing measurements where some instructions remain cached. static __global__ void flush_cache() { asm __volatile__("s_icache_inv \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" "s_nop 0 \n\t" :: :); } } // namespace ck_tile