mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-29 11:16:59 +00:00
The 3rd argument of buffer_load_dwordx4 is a scalar register.
But the compiler generates a waterfall loop as if lanes could have
different values, even though the original value comes from a scalar
register:
v_mov_b32_e32 v187, s43
...
; %bb.65:
s_mov_b64 exec, s[0:1]
s_add_i32 s43, s40, 0x8400
s_mov_b64 s[0:1], exec
s_mov_b32 m0, s43
.LBB2_66: ; =>This Inner Loop Header: Depth=1
v_readfirstlane_b32 s51, v187
s_nop 1
v_cmp_eq_u32_e32 vcc, s51, v187
s_and_saveexec_b64 vcc, vcc
s_nop 0
buffer_load_dwordx4 v197, s[12:15], s51 offen lds
s_xor_b64 exec, exec, vcc
s_cbranch_execnz .LBB2_66