[CK_TILE] Add fmha fwd headdim96 support (#1608)

* Add ceil_to_qualified_tile_length()
* Rename kK0BlockLength to kQKHeaddim
* Add kSubQKHeaddim concept to support headdim96
* Fix in math.hpp to avoid using __half interfaces
* Add LdsBufferSequence instance for headdim96
* Update in fmha_fwd/fmha_fwd_splitkv codegen to support hd96 testing
* Disable hd96 instance generation in codegen fmha_fwd and fmha_fwd_splitkv to save compiling time
* Reformat one file
* Fix text alignment in fmha_fwd_splitkv.py

---------

Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
2026-05-04 21:51:28 +00:00 · 2024-10-30 14:03:16 +08:00
parent 4d7e063a0a
commit 8632221814
12 changed files with 153 additions and 107 deletions
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -82,10 +82,10 @@ struct FmhaFwdKernel
            if (kPadHeadDimV) n += "dv";
            return n.empty() ? n : std::string("p") + n; }();
        return
-            _SS_("fmha_fwd_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s<QDataType>::name) +
+            _SS_("fmha_fwd_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
            "_" + (kIsGroupMode ? "group" : "batch") + "_" + _SS_(TilePartitioner::name) + "_"
            "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
-                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
+                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
            "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
            "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
            "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
@@ -657,7 +657,7 @@ struct FmhaFwdKernel
            {
                return pad_tensor_view(
                    q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0BlockLength>{}),
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
            }
            else
@@ -724,7 +724,7 @@ struct FmhaFwdKernel
            [&]() {
                if constexpr(FmhaPipeline::kQLoadOnce)
                    return make_tuple(number<FmhaPipeline::kM0>{},
-                                      number<FmhaPipeline::kK0BlockLength>{});
+                                      number<FmhaPipeline::kSubQKHeaddim>{});
                else
                    return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
            }(),
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -78,10 +78,10 @@ struct FmhaFwdSplitKVKernel
            if (kPadHeadDimV) n += "dv";
            return n.empty() ? n : std::string("p") + n; }();
        return
-            _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kK0BlockLength) + "_" + _SS_(t2s<QDataType>::name) +
+            _SS_("fmha_fwd_splitkv_d") + _TS_(bfs::kQKHeaddim) + "_" + _SS_(t2s<QDataType>::name) +
            "_" + (kIsGroupMode ? "group" : "batch") + "_"
            "b" + _TS_(bfs::kM0) + "x" + _TS_(bfs::kN0) + "x" + _TS_(bfs::kK0) + "x" +
-                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kK0BlockLength) + "_" +
+                    _TS_(bfs::kN1) + "x" + _TS_(bfs::kK1) + "x" + _TS_(bfs::kQKHeaddim) + "_" +
            "r" + _TS_(g0br::at(ck_tile::number<0>{})) + "x" + _TS_(g0br::at(ck_tile::number<1>{})) + "x" + _TS_(g0br::at(ck_tile::number<2>{})) + "_" +
            "r" + _TS_(g1br::at(ck_tile::number<0>{})) + "x" + _TS_(g1br::at(ck_tile::number<1>{})) + "x" + _TS_(g1br::at(ck_tile::number<2>{})) + "_" +
            "w" + _TS_(gwt::at(ck_tile::number<0>{})) + "x" + _TS_(gwt::at(ck_tile::number<1>{})) + "x" + _TS_(gwt::at(ck_tile::number<2>{})) + "_" +
@@ -586,7 +586,7 @@ struct FmhaFwdSplitKVKernel
            {
                return pad_tensor_view(
                    q_dram_naive,
-                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0BlockLength>{}),
+                    make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kSubQKHeaddim>{}),
                    sequence<kPadSeqLenQ, kPadHeadDimQ>{});
            }
            else
@@ -735,7 +735,7 @@ struct FmhaFwdSplitKVKernel
            [&]() {
                if constexpr(FmhaPipeline::kQLoadOnce)
                    return make_tuple(number<FmhaPipeline::kM0>{},
-                                      number<FmhaPipeline::kK0BlockLength>{});
+                                      number<FmhaPipeline::kSubQKHeaddim>{});
                else
                    return make_tuple(number<FmhaPipeline::kM0>{}, number<FmhaPipeline::kK0>{});
            }(),
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp
@@ -34,12 +34,13 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS

    static constexpr index_t kBlockSize = Problem::kBlockSize;

-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

    static constexpr bool kIsGroupMode     = Problem::kIsGroupMode;
    static constexpr bool kPadSeqLenQ      = Problem::kPadSeqLenQ;
@@ -75,22 +76,22 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS
            return Problem::kBlockPerCu;
        else
        {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
            {
                return 2;
            }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
            {
                return 3;
            }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
            {
                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 1;
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
            {
                return 1;
            }
@@ -270,7 +271,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS

        // prefetch K tile
        index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        static_assert(2 <= k0_loops);
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs.hpp
@@ -37,12 +37,13 @@ struct BlockFmhaPipelineQRKSVS

    static constexpr index_t kBlockSize = Problem::kBlockSize;

-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
    static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -76,22 +77,22 @@ struct BlockFmhaPipelineQRKSVS
            return Problem::kBlockPerCu;
        else
        {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
            {
                return 2;
            }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
            {
                return 3;
            }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
            {
                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 1;
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
            {
                return 1;
            }
@@ -261,7 +262,7 @@ struct BlockFmhaPipelineQRKSVS

        // prefetch K tile
        index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        static_assert(2 <= k0_loops);
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_async.hpp
@@ -38,12 +38,13 @@ struct BlockFmhaPipelineQRKSVSAsync

    static constexpr index_t kBlockSize = Problem::kBlockSize;

-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
    // TODO: seq_q always support padding, hdim_q/v support multiple of vector(like 8x)
@@ -87,7 +88,7 @@ struct BlockFmhaPipelineQRKSVSAsync
                return 1;
            }

-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
            {
                if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS &&
                             FmhaMask::IsMasking)
@@ -95,21 +96,21 @@ struct BlockFmhaPipelineQRKSVSAsync
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
            {
                if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 2;
                else
                    return 3;
            }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
            {
                if constexpr(kPadSeqLenK && BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 1;
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
            {
                return 1;
            }
@@ -339,7 +340,7 @@ struct BlockFmhaPipelineQRKSVSAsync
        // auto q_tile = q;      // tile_elementwise_in(q_element_func, q);

        index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        static_assert(1 <= k0_loops);
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qr_ks_vs_fp8.hpp
@@ -36,12 +36,12 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8

    static constexpr index_t kBlockSize = Problem::kBlockSize;

-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0        = BlockFmhaShape::kM0;
+    static constexpr index_t kN0        = BlockFmhaShape::kN0;
+    static constexpr index_t kK0        = BlockFmhaShape::kK0;
+    static constexpr index_t kN1        = BlockFmhaShape::kN1;
+    static constexpr index_t kK1        = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;

    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
    static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -75,22 +75,22 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8
            return Problem::kBlockPerCu;
        else
        {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
            {
                return 2;
            }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
            {
                return 3;
            }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
            {
                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 1;
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
            {
                return 1;
            }
@@ -232,7 +232,7 @@ struct [[deprecated]] BlockFmhaPipelineQRKSVSFp8

        // prefetch K tile
        index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        static_assert(2 <= k0_loops);
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qs_ks_vs.hpp
@@ -36,12 +36,13 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS

    static constexpr index_t kBlockSize = Problem::kBlockSize;

-    static constexpr index_t kM0            = BlockFmhaShape::kM0;
-    static constexpr index_t kN0            = BlockFmhaShape::kN0;
-    static constexpr index_t kK0            = BlockFmhaShape::kK0;
-    static constexpr index_t kN1            = BlockFmhaShape::kN1;
-    static constexpr index_t kK1            = BlockFmhaShape::kK1;
-    static constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+    static constexpr index_t kM0           = BlockFmhaShape::kM0;
+    static constexpr index_t kN0           = BlockFmhaShape::kN0;
+    static constexpr index_t kK0           = BlockFmhaShape::kK0;
+    static constexpr index_t kN1           = BlockFmhaShape::kN1;
+    static constexpr index_t kK1           = BlockFmhaShape::kK1;
+    static constexpr index_t kQKHeaddim    = BlockFmhaShape::kQKHeaddim;
+    static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;

    static constexpr bool kIsGroupMode = Problem::kIsGroupMode;
    static constexpr bool kPadSeqLenQ  = Problem::kPadSeqLenQ;
@@ -56,22 +57,22 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS
            return Problem::kBlockPerCu;
        else
        {
-            if constexpr(kK0BlockLength <= 32)
+            if constexpr(kQKHeaddim <= 32)
            {
                return 2;
            }
-            else if constexpr(kK0BlockLength <= 64)
+            else if constexpr(kQKHeaddim <= 64)
            {
                return 3;
            }
-            else if constexpr(kK0BlockLength <= 128)
+            else if constexpr(kQKHeaddim <= 128)
            {
                if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
                    return 1;
                else
                    return 2;
            }
-            else if constexpr(kK0BlockLength <= 256)
+            else if constexpr(kQKHeaddim <= 256)
            {
                return 1;
            }
@@ -235,7 +236,7 @@ struct [[deprecated]] BlockFmhaPipelineQSKSVS

        // prefetch K tile
        index_t i_total_loops      = 0;
-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        static_assert(2 <= k0_loops);
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -55,7 +55,7 @@ struct BlockFmhaPipelineQXCustomPolicy</* QLoadOnce = */ true>
        constexpr index_t MWarp = config.template at<1>();

        constexpr index_t kMPerBlock = Problem::BlockFmhaShape::kM0;
-        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kK0BlockLength;
+        constexpr index_t kKPerBlock = Problem::BlockFmhaShape::kSubQKHeaddim;

        constexpr index_t K2 = WG::kK / WG::WarpGemmAttribute::Impl::kABKLane;
        constexpr index_t K1 = WG::WarpGemmAttribute::Impl::kABKLane;
@@ -323,6 +323,9 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
    template<> struct
    LdsBufferSequence<3, 3, 3, 3> { using type = sequence<1, 2, 0,      1, 2, 0>; };

+    template<> struct
+    LdsBufferSequence<3, 3, 3, 4> { using type = sequence<1, 2, 0,      0, 1, 2, 0>; };
+
    template<> struct
    LdsBufferSequence<3, 3, 2, 2> { using type = sequence<1, 2,         1, 0>;};
    // clang-format on
@@ -332,12 +335,12 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
    {
        using BlockFmhaShape = remove_cvref_t<typename Problem::BlockFmhaShape>;

-        constexpr index_t kN0            = BlockFmhaShape::kN0;
-        constexpr index_t kK0            = BlockFmhaShape::kK0;
-        constexpr index_t kK1            = BlockFmhaShape::kK1;
-        constexpr index_t kK0BlockLength = BlockFmhaShape::kK0BlockLength;
+        constexpr index_t kN0        = BlockFmhaShape::kN0;
+        constexpr index_t kK0        = BlockFmhaShape::kK0;
+        constexpr index_t kK1        = BlockFmhaShape::kK1;
+        constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;

-        constexpr index_t k0_loops = kK0BlockLength / kK0;
+        constexpr index_t k0_loops = kQKHeaddim / kK0;
        constexpr index_t k1_loops = kN0 / kK1;

        return typename LdsBufferSequence<NumPrefetchK, NumPrefetchV, k0_loops, k1_loops>::type{};
--- a/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/tile_fmha_shape.hpp
@@ -7,6 +7,20 @@

 namespace ck_tile {

+static CK_TILE_HOST_DEVICE constexpr index_t ceil_to_qualified_tile_length(index_t len) // maps `len` to a qualified tile length; returns 0 when there is none
+{
+    if(len == 96) // special case: 96 is rounded up to 128
+        return 128;
+    if(len == 160) // special case: 160 is rounded up to 256
+        return 256;
+
+    // Apart from 96 and 160, only power-of-two lengths are supported; those are returned unchanged.
+    if(!(len & (len - 1))) // for non-negative len this holds exactly when len is 0 or a power of two
+        return len;
+
+    return 0; // no qualified tile length exists for this input
+}; // NOTE(review): the ';' after the function body is redundant and may trigger extra-semicolon warnings
+
 template <typename BlockTile_, // sequence<...
          typename Gemm0BlockWarps_,
          typename Gemm0WarpTile_,
@@ -36,10 +50,12 @@ struct TileFmhaShape
    static constexpr index_t kK0 = BlockTile::at(number<2>{}); // tile size along qk gemm unroll
    static constexpr index_t kN1 = BlockTile::at(number<3>{}); // tile size along v head_dim
    static constexpr index_t kK1 = BlockTile::at(number<4>{}); // tile size along kv gemm unroll
-    static constexpr index_t kK0BlockLength =
+    static constexpr index_t kQKHeaddim =
        BlockTile::at(number<5>{}); // total length of K0, used for pipeline that need load Q at
                                    // once (or repeately load Q as a whole tile)
-    static_assert(kK0BlockLength % kK0 == 0, "kK0BlockLength should be divisible by kK0");
+    static_assert(kQKHeaddim % kK0 == 0, "kQKHeaddim should be divisible by kK0");
+
+    static constexpr index_t kSubQKHeaddim = ceil_to_qualified_tile_length(kQKHeaddim);

    // v, rowmajor : seqlen*hdim, colmajor : hdim*seqlen
    static constexpr bool IsVLayoutRowMajor = IsVLayoutRowMajor_;