From 18108d0d540fff852ec20b1115a1c27b1ed7d009 Mon Sep 17 00:00:00 2001 From: Qianfeng Zhang Date: Fri, 12 Dec 2025 09:27:17 +0000 Subject: [PATCH] Fix stride definition in MakeKLdsBlockDescriptor() --- .../hstu_attention_fwd_pipeline_default_policy.hpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline_default_policy.hpp b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline_default_policy.hpp index 7c9241ebfb..1b7065dea4 100644 --- a/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline_default_policy.hpp +++ b/example/ck_tile/18_hstu_attention/hstu_attention_fwd_pipeline_default_policy.hpp @@ -404,20 +404,32 @@ struct HstuAttentionFwdPipelineQRKSVSDefaultPolicy { static_assert(kKVector == kKPack); + constexpr index_t KSingleSmemElementSpaceSize = kNPerBlock * kKPerBlock; + + static_assert(KSingleSmemElementSpaceSize == GetKSingleSmemElementSpaceSize()); + + constexpr index_t SingleSmemElementSpaceSize = GetSingleSmemElementSpaceSize(); + using KDataType = remove_cvref_t; constexpr index_t DataTypeSize = sizeof(KDataType); +#ifdef __gfx950__ + // 256 contiguous bytes mapped to 64 banks with each bank 4 contiguous bytes + constexpr auto NLdsLayer = + (64 * 4 / kKPerBlock / DataTypeSize) < 1 ? 1 : (64 * 4 / kKPerBlock / DataTypeSize); +#else // 128 contiguous bytes mapped to 32 banks with each bank 4 contiguous bytes constexpr auto NLdsLayer = (32 * 4 / kKPerBlock / DataTypeSize) < 1 ? 1 : (32 * 4 / kKPerBlock / DataTypeSize); +#endif constexpr auto k_lds_block_desc_0 = make_naive_tensor_descriptor(make_tuple(number{}, number{}, number{}, number{}), - make_tuple(number{}, + make_tuple(number{}, number{}, number{}, number<1>{}),