Add medium-tier small-cache optimization (zero overhead for <100K blocks)

- Add medium tier small-cache variants to unified_attention.cpp dispatch
- Create instance files for medium tier with MaxNumBlocks=100000
- Add instances to optCompilerConfig.json
- Results: zero overhead (6.062ms vs 6.067ms baseline) for 50K blocks
- Large cache (1.5M blocks) still works correctly with runtime rebasing
This commit is contained in:
juuso-oskari
2026-05-07 09:27:55 +00:00
parent 95f813013f
commit e9cf036a81
2 changed files with 30 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "unified_attention.hpp"
#include "unified_attention_impl.hpp"
namespace ck_tile {
// Medium-tier small-cache optimized variant: MaxNumBlocks=100000 (zero rebasing overhead)
using kernel_traits =
unified_attention_decode_kernel_traits<unified_attention_args::data_type_enum::bf16, true, 64, 128, 8, 32, 100000>;
INST_UNIFIED_ATTENTION_DISPATCH(kernel_traits)
} // namespace ck_tile

View File

@@ -0,0 +1,15 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "unified_attention.hpp"
#include "unified_attention_impl.hpp"
namespace ck_tile {
// Medium-tier small-cache optimized variant: MaxNumBlocks=100000 (zero rebasing overhead)
using kernel_traits =
unified_attention_decode_kernel_traits<unified_attention_args::data_type_enum::bf16, false, 64, 128, 8, 32, 100000>;
INST_UNIFIED_ATTENTION_DISPATCH(kernel_traits)
} // namespace ck_tile