diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp index 501699b277..ff5da4aca0 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp @@ -414,6 +414,8 @@ struct HstuAttentionFwdPipelineQRKSVS pcomp_tile = cast_tile(sacc_tile); + tile_elementwise_inout(f_silu, pcomp_tile); + if constexpr(std::is_same_v) { auto v_shuffle_tmp = make_static_distributed_tensor( @@ -436,10 +438,6 @@ struct HstuAttentionFwdPipelineQRKSVS tile_elementwise_in(v_element_func, v_tile)); // store the prefetch }; - __builtin_amdgcn_sched_barrier(0); - - tile_elementwise_inout(f_silu, pcomp_tile); - if constexpr(kHasDropout) { auto randval_lds_ptr = reinterpret_cast(smem_ptr) +