mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
[CK TILE] Refactor function amd_buffer_load_invalid_element_return_zero (#3512)
Refactor the function amd_buffer_load_invalid_element_return_zero to avoid the inefficient ASM code generated by the compiler. The compiler generates suboptimal assembly for the ternary operator, causing excessive VGPR usage. Tested compilers: ROCm 7.0.1, ROCm 7.1.1. Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
@@ -2376,12 +2376,23 @@ amd_buffer_load_invalid_element_return_zero(const T* p_src_wave,
|
||||
return amd_buffer_load_impl<T, N, coherence>(
|
||||
src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
|
||||
#else
|
||||
thread_buffer<T, N> tmp =
|
||||
amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
|
||||
if constexpr(oob_conditional_check)
|
||||
return src_thread_element_valid ? tmp : thread_buffer<T, N>{numeric<T>::zero()};
|
||||
{
|
||||
if(src_thread_element_valid)
|
||||
{
|
||||
return amd_buffer_load_impl<T, N, coherence>(
|
||||
src_wave_buffer_resource, src_thread_addr_offset, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
return thread_buffer<T, N>{numeric<T>::zero()};
|
||||
}
|
||||
}
|
||||
else
|
||||
return tmp;
|
||||
{
|
||||
return amd_buffer_load_impl<T, N, coherence>(
|
||||
src_wave_buffer_resource, src_thread_addr_offset, 0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user