[CK TILE] Refactor function amd_buffer_load_invalid_element_return_zero (#3512)

Refactor the function amd_buffer_load_invalid_element_return_zero to avoid
the inefficient assembly code generated by the compiler.

The compiler generates suboptimal assembly for the ternary operator, causing excessive VGPR usage.

Tested compilers:
- ROCm 7.0.1
- ROCm 7.1.1

Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
Cong Ma
2026-01-07 01:05:56 -07:00
committed by GitHub
parent aaa35f0bbf
commit d7497d2694

View File

@@ -2376,12 +2376,23 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
return amd_buffer_load_impl<T, N, coherence>(
src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
#else
thread_buffer<T, N> tmp =
amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
if constexpr(oob_conditional_check)
return src_thread_element_valid ? tmp : thread_buffer<T, N>{numeric<T>::zero()};
{
if(src_thread_element_valid)
{
return amd_buffer_load_impl<T, N, coherence>(
src_wave_buffer_resource, src_thread_addr_offset, 0);
}
else
{
return thread_buffer<T, N>{numeric<T>::zero()};
}
}
else
return tmp;
{
return amd_buffer_load_impl<T, N, coherence>(
src_wave_buffer_resource, src_thread_addr_offset, 0);
}
#endif
}