From 026c9200eef26a418985327dc20e8515b2fb9470 Mon Sep 17 00:00:00 2001 From: Cong Ma <142121551+CongMa13@users.noreply.github.com> Date: Wed, 7 Jan 2026 01:05:56 -0700 Subject: [PATCH] [CK TILE] Refactor function amd_buffer_load_invalid_element_return_zero (#3512) Refactor function amd_buffer_load_invalid_element_return_zero to avoid the inefficient ASM code generated by compiler. Compiler generates suboptimal assembly for ternary operator, causing excessive VGPR usage Tested compilers: - Rocm 7.0.1 - Rocm 7.1.1 Co-authored-by: Thomas Ning [ROCm/composable_kernel commit: d7497d26948ca90d0224920472712e0f657fb744] --- .../arch/amd_buffer_addressing_builtins.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 562b246ac3..9f9770df1b 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -2376,12 +2376,23 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, return amd_buffer_load_impl( src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); #else - thread_buffer tmp = - amd_buffer_load_impl(src_wave_buffer_resource, src_thread_addr_offset, 0); if constexpr(oob_conditional_check) - return src_thread_element_valid ? tmp : thread_buffer{numeric::zero()}; + { + if(src_thread_element_valid) + { + return amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0); + } + else + { + return thread_buffer{numeric::zero()}; + } + } else - return tmp; + { + return amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0); + } #endif }