From f8dea2bc86443374adc6b3076e72b8013262a099 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Tue, 9 Sep 2025 08:59:58 +0000 Subject: [PATCH] Use set_slice_tile() to replace direct thread_buffer assignment --- .../18_hstu_attention/hstu_attention_fwd_pipeline.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp index d812c2d0ee..b97fd9d550 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline.hpp @@ -329,10 +329,10 @@ struct HstuAttentionFwdPipelineQRKSVS __builtin_amdgcn_s_waitcnt(0xc07f); // the following codes will not generate actual instructions by the compiler - static_for<0, splitted_tile_thread_buf_size, 1>{}([&](auto i_buf) { - q_tile.get_thread_buffer()[i_rep * splitted_tile_thread_buf_size + i_buf] = - q_reg_tiles[i_rep].get_thread_buffer()[i_buf]; - }); + set_slice_tile(q_tile, + q_reg_tiles[i_rep], + sequence{}, + sequence<(i_rep + 1) * kGemmSingleRepM, kQKHeaddim>{}); // no need to call __builtin_amdgcn_s_barrier() since the tile-slice read // by each wavefront is over-written by itself