From 2be2c3cd113b8524a53d5e2fd220d01f8b7b874d Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Sat, 21 Feb 2026 14:46:31 +0000 Subject: [PATCH] Pass partition_index to get_x_indices_from_distributed_indices() to reduce calls of __builtin_amdgcn_readfirstlane() --- .../ck_tile/18_hstu_attention/hstu_attention_fwd_kernel.hpp | 3 ++- .../hstu_attention_no_softmax_fwd_pipeline.hpp | 4 +++- .../hstu_attention_no_softmax_fwd_trload_pipeline.hpp | 4 +++- .../hstu_attention_with_softmax_fwd_pipeline.hpp | 4 +++- .../hstu_attention_with_softmax_fwd_trload_pipeline.hpp | 4 +++- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_kernel.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_kernel.hpp index 2322fae18e..e88a3060cd 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_kernel.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_kernel.hpp @@ -519,7 +519,8 @@ struct HstuAttentionFwdKernel seqlen_in_first_split = kargs.seqlen_q - num_target - kargs.min_full_attn_seqlen; index_t num_tile_in_first_split = - ck_tile::integer_divide_ceil(seqlen_in_first_split, HstuAttentionPipeline::kM0); + __builtin_amdgcn_readfirstlane(ck_tile::integer_divide_ceil( + seqlen_in_first_split, HstuAttentionPipeline::kM0)); is_tile_in_first_split = (i_tile_m < num_tile_in_first_split); diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_pipeline.hpp index 8fe2f99561..e706693b85 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_pipeline.hpp @@ -361,7 +361,9 @@ struct HstuAttentionNoSoftmaxFwdPipelineQRKSVS sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { const auto tile_idx = get_x_indices_from_distributed_indices( - pcomp_tile.get_tile_distribution(), make_tuple(idx0, idx1)); + pcomp_tile.get_tile_distribution(), + make_tuple(idx0, idx1), + partition_index); const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = seqlen_k_curr + tile_idx.at(number<1>{}); diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_trload_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_trload_pipeline.hpp index ecb8c8a9bc..10e02aaf36 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_trload_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_no_softmax_fwd_trload_pipeline.hpp @@ -363,7 +363,9 @@ struct HstuAttentionNoSoftmaxFwdPipelineQRKSVSTrLoad sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { const auto tile_idx = get_x_indices_from_distributed_indices( - pcomp_tile.get_tile_distribution(), make_tuple(idx0, idx1)); + pcomp_tile.get_tile_distribution(), + make_tuple(idx0, idx1), + partition_index); const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = seqlen_k_curr + tile_idx.at(number<1>{}); diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp index 38a6319fbe..c2e6907e3d 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_pipeline.hpp @@ -392,7 +392,9 @@ struct HstuAttentionWithSoftmaxFwdPipelineQRKSVS sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { const auto tile_idx = get_x_indices_from_distributed_indices( - pcomp_tile.get_tile_distribution(), make_tuple(idx0, idx1)); + pcomp_tile.get_tile_distribution(), + make_tuple(idx0, idx1), + partition_index); const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = seqlen_k_curr + tile_idx.at(number<1>{}); diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp index 7db48fc368..5df7dbabea 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_with_softmax_fwd_trload_pipeline.hpp @@ -394,7 +394,9 @@ struct HstuAttentionWithSoftmaxFwdPipelineQRKSVSTrLoad sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) { sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) { const auto tile_idx = get_x_indices_from_distributed_indices( - pcomp_tile.get_tile_distribution(), make_tuple(idx0, idx1)); + pcomp_tile.get_tile_distribution(), + make_tuple(idx0, idx1), + partition_index); const auto row = q_origin.at(number<0>{}) + tile_idx.at(number<0>{}); const auto col = seqlen_k_curr + tile_idx.at(number<1>{});