Merge branch 'develop' into hstu_attention_n0loop_fused_unroll

2026-05-15 18:42:06 +00:00 · 2025-08-18 13:47:44 +00:00
parent fb09061b0c 26d3300930
commit 89cd5ff5fe
1876 changed files with 192746 additions and 21091 deletions
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -1,7 +1,7 @@
 # validate user-specified fmha_fwd API list
-set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv")
+set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv;pagedkv_prefill")
 set(FMHA_FWD_ENABLE_APIS "fwd" CACHE STRING
-    "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
+  "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
 if(FMHA_FWD_ENABLE_APIS STREQUAL "all")
  set(FMHA_FWD_ENABLE_APIS ${FMHA_FWD_KNOWN_APIS})
 endif()
@@ -17,24 +17,45 @@ if(NOT "fwd" IN_LIST FMHA_FWD_ENABLE_APIS)
  list(APPEND FMHA_FWD_ENABLE_APIS "fwd")
 endif()

+file(GLOB_RECURSE CODE_GEN_SCRIPTS CONFIGURE_DEPENDS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  ${CMAKE_CURRENT_LIST_DIR}/codegen/*.py
+)
+# re-run execute_process `generate.py --list_blobs` if any of the codegen scripts change
+set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS "${CODE_GEN_SCRIPTS}")
+
 string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
+set(FMHA_FWD_CODE_GEN_COMMON_ARGS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${FMHA_FWD_APIS}
+  --optdim 32,64,128,256
+  # --filter fmha_fwd...
+)
+set(FMHA_BWD_CODE_GEN_COMMON_ARGS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api bwd
+  --receipt 3
+  --optdim 32,64,128,256
+  # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd...
+)
+
 # generate a list of kernels, but not actually emit files at config sta
 execute_process(
-  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api ${FMHA_FWD_APIS} --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_FWD_CODE_GEN_COMMON_ARGS}
+  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
  RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
-  message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of FWD kernels via Python.")
+  message(FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of FWD kernels via Python.")
 endif()

 execute_process(
-  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api bwd --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt --receipt 3
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_BWD_CODE_GEN_COMMON_ARGS}
+  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
  RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
-  message( FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of BWD kernels via Python.")
+  message(FATAL_ERROR "CK Tile FMHA FAILED to genrate a list of BWD kernels via Python.")
 endif()

 # NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
@@ -44,20 +65,22 @@ file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS)

 add_custom_command(
  OUTPUT ${FMHA_FWD_GEN_BLOBS}
-  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api ${FMHA_FWD_APIS} --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_FWD_CODE_GEN_COMMON_ARGS}
+  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${CODE_GEN_SCRIPTS}
 )

 add_custom_command(
  OUTPUT ${FMHA_BWD_GEN_BLOBS}
-  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
-  --api bwd --output_dir ${CMAKE_CURRENT_BINARY_DIR} --receipt 3
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_BWD_CODE_GEN_COMMON_ARGS}
+  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${CODE_GEN_SCRIPTS}
 )

 set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-message("adding example ${EXAMPLE_FMHA_FWD}")
+message(DEBUG "adding example ${EXAMPLE_FMHA_FWD}")
 add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL fmha_fwd.cpp)
 target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
@@ -65,7 +88,7 @@ target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})
 set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-message("adding example ${EXAMPLE_FMHA_BWD}")
+message(DEBUG "adding example ${EXAMPLE_FMHA_BWD}")
 add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL fmha_bwd.cpp)
 target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
@@ -73,7 +96,7 @@ target_sources(${EXAMPLE_FMHA_BWD} PRIVATE ${FMHA_BWD_GEN_BLOBS})
 # NOTE: this is dangerous since will change the whole kernel to flush denormals
 #       WIP with compiler team for an exp2 intrinsic..., then remove this
 if(NOT DEFINED FMHA_FWD_FAST_EXP2)
-    set(FMHA_FWD_FAST_EXP2 true)
+  set(FMHA_FWD_FAST_EXP2 true)
 endif()

 set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)
@@ -82,9 +105,9 @@ set(EXAMPLE_FMHA_BWD_COMPILE_OPTIONS)
 # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
 #       ... because they are auto-generated
 if(FMHA_FWD_FAST_EXP2)
-	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
 else()
-	list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
 endif()
 list(APPEND EXAMPLE_FMHA_BWD_COMPILE_OPTIONS -Wno-undefined-func-template -fgpu-flush-denormals-to-zero)

@@ -102,6 +125,13 @@ else()
  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
 endif()

+# conditionally enable call to the pagedkv_prefill API in fmha_fwd example
+if("pagedkv_prefill" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=1)
+else()
+  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=0)
+endif()
+
 # conditionally specify the use of OCP_FP8
 if(CK_USE_OCP_FP8)
  list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -71,6 +71,7 @@ args:
  -drop_seed    seed for random number generator (default:1)
 -drop_offset    offset for random number generator (default:0)
 -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
+ -num_splits    number of splits for key/value. 0 to determine actual number by heuristic (default:1)
     -warmup    number of iterations before benchmark the kernel (default:5)
     -repeat    number of iterations to benchmark the kernel (default:20)
 ```
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -114,15 +114,22 @@ LAYOUT_MAP = {
 PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaPipelineQRKSVS",
    "qr_async" : "ck_tile::BlockFmhaPipelineQRKSVSAsync",
+    "qs" : "ck_tile::BlockFmhaPipelineQSKSVS",
+    "qr_async_trload" : "ck_tile::BlockFmhaPipelineQRKSVSAsyncTrload",
 }

 PIPELINE_ENUM_MAP = {
    "qr" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
    "qr_async" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC",
    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qs" : "ck_tile::BlockFmhaPipelineEnum::QSKSVS",
+    "qr_pagedkv" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qr_async_trload" : "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD",
 }

 BOOL_MAP = {
    "t" : "true",
-    "f" : "false"
+    "f" : "false",
+    True : "true",
+    False : "false",
 }
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
@@ -0,0 +1,626 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import copy
+from dataclasses import dataclass, field
+import fnmatch
+import itertools
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codegen.cmake_config import *
+from codegen.cpp_symbol_map import *
+
+
+DTYPE_BITS = {
+    "fp32": 32,
+    "fp16": 16,
+    "bf16": 16,
+    "fp8" : 8,
+    "bf8" : 8
+}
+
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
+
+FMHA_BATCH_PREFILL_PIPELINE_MAP = {
+    "qr_async" : "ck_tile::BlockFmhaBatchPrefillPipelineQRKSVSAsync",
+}
+
+FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
+// auto generated by generate.py
+#include "ck_tile/ops/fmha/block/variants.hpp"
+#include "fmha_fwd.hpp"
+"""
+
+FMHA_FWD_KERNEL_BODY="""
+using fmha_dtype_{F_idx} = {F_dtype};
+
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
+
+using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
+                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
+                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
+                                      {F_vlayout}>;
+
+using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
+                                                    {F_skpad},
+                                                    {F_dpad},
+                                                    {F_dvpad},
+                                                    {F_logits},
+                                                    {F_bias},
+                                                    false,
+                                                    {F_lse},
+                                                    {F_dropout},
+                                                    {F_squant},
+                                                    {F_occupancy}>;
+
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
+
+using fmha_mask_{F_idx} = {F_mask};
+
+using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::RandValOutputDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
+    fmha_shape_{F_idx},
+    {F_mode},
+    fmha_variant_{F_idx},
+    fmha_mask_{F_idx},
+    false,
+    fmha_trait_{F_idx}>;
+
+using fmha_pipeline_{F_idx} = {F_pipeline}<
+    fmha_pipeline_problem_{F_idx}>;
+
+using fmha_epilogue_{F_idx} =
+    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
+                                           typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
+                                           {F_spad}, {F_dvpad}>>;
+
+using fmha_kernel_{F_idx} =
+    ck_tile::FmhaBatchPrefillWithPagedKVCacheKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
+
+using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false>;
+
+#include <iostream>
+
+template<>
+float fmha_batch_prefill_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_batch_prefill_args a)
+{{
+    using k_ = fmha_kernel_{F_idx};
+    if(s.log_level_ > 0)
+        std::cout << ", " << k_::GetName() << std::flush;
+    auto [kargs, grids] = fmha_batch_prefill_create_kargs_and_grids<k_>(a);
+    constexpr dim3 blocks             = k_::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+}}
+"""
+
+FMHA_FWD_API_FILENAME="fmha_batch_prefill_api.cpp"
+FMHA_FWD_API="""
+#include <cstdio>
+
+namespace {{
+bool get_num_cus(unsigned& num_cu) {{
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device");
+        return false;
+    }}
+
+    hipDeviceProp_t props{{}};
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device properties");
+        return false;
+    }}
+
+    num_cu = props.multiProcessorCount;
+    return true;
+}}
+
+unsigned get_num_thread_blocks(unsigned batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{
+    const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0;
+    const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1
+
+    return batch * nheads * num_m_blocks * num_n_blocks;
+}}
+}} // namespace
+
+float fmha_batch_prefill(fmha_batch_prefill_traits t, fmha_batch_prefill_args a, const ck_tile::stream_config& s) {{
+    float r = -1;
+
+    [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate
+
+    unsigned num_cus;
+    if (!get_num_cus(num_cus)) {{
+        return r;
+    }}
+
+    [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{
+        return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0);
+    }};
+
+{F_dispatch}
+    return r;
+}}
+"""
+
+FMHA_FWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
+{F_hdim_case}
+    }}
+"""
+FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{
+{F_inner_dispatch}
+        }}
+"""
+
+FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
+                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false>;
+                return fmha_batch_prefill_<trait_>(s, a);
+            }}
+"""
+
+@dataclass
+class CppConstraint:
+    bool_expr: str = None
+
+    def __str__(self):
+        if self.bool_expr is None:
+            return 'true'
+        else:
+            return f'{self.bool_expr}'
+
+    def __and__(self, other):
+        return CppConstraint(f'({str(self)}) && ({str(other)})')
+
+@dataclass
+class FmhaFwdApiTrait:
+    pipeline_tag : str
+    # sync with fmha_fwd_traits<>, to generate fallback calls
+    hdim       : str
+    dtype      : str  # data type
+    mode       : str  # value from MODE_MAP
+    bm0        : int  # tile size along q seqlen (block size)
+    bn0        : int  # tile size along qk seqlen
+    bk0        : int  # tile size along qk gemm unroll
+    bn1        : int  # tile size along v head_dim
+    bk1        : int  # tile size along kv gemm unroll
+    bk0max     : int
+    vlayout    : str
+    logits     : str
+    mask       : str
+    bias       : str  #
+    lse        : str  #
+    dropout    : str
+    squant     : str  #
+    spad       : str
+    skpad      : str
+    dpad       : str
+    dvpad      : str
+    constraint : CppConstraint
+
+    @property
+    def name(self) -> str:
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
+                    f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'
+
+    @property
+    def scheck(self) -> str:
+        if self.mode == 'group': return 'true/*group mode spad always true*/'                  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == 'qr_async':
+            if self.spad == 't' : return 'true' # always support
+            else :                return 'true'
+        elif self.pipeline_tag in ['qr']:
+            if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_q % {self.bm0} == 0'
+        else: assert False
+
+    @property
+    def skcheck(self) -> str:
+        if self.mode == 'group': return 'true/*group mode skpad always true*/'                  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == 'qr_async':
+            if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
+            else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
+        elif self.pipeline_tag in ['qr', 'qr_fp8']:
+            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_k % {self.bn0} == 0'
+        else: assert False
+
+    @property
+    def dcheck(self) -> str:
+        if self.pipeline_tag == 'qr_async':
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
+            else :               assert False
+        elif self.pipeline_tag in ['qr']:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
+        else:   assert False
+
+    @property
+    def dvcheck(self) -> str:
+        if self.pipeline_tag == 'qr_async':
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
+            else :                assert False
+        elif self.pipeline_tag in ['qr']:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
+        else:   assert False
+
+@dataclass
+class FmhaFwdPipeline:
+    tag : str
+
+    F_vlayout    : str  # row/col
+    F_spad       : str  # true/false
+    F_skpad      : str  #
+    F_dpad       : str  #
+    F_dvpad      : str  #
+    F_logits     : str  # t/f
+    F_bias       : str  # true/false
+    F_lse        : str  #
+    F_dropout    : str  #
+    F_squant     : str  #
+    F_mask       : str  # value from MASK_MAP
+    F_constraint : CppConstraint = field(default_factory=lambda: CppConstraint())
+
+    @property
+    def name(self) -> str:
+        def pad_name() -> str:
+            n = ''
+            if self.F_spad == 't': n += 's'
+            if self.F_skpad == 't' : n += 'sk'
+            if self.F_dpad == 't' : n += 'd'
+            if self.F_dvpad == 't' : n += 'dv'
+            if n != '' : n = 'p' + n
+            return n
+        pn = pad_name()
+        n = f'{self.tag}_v{self.F_vlayout[0]}'
+        if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
+        if self.F_logits == 't' : n += '_logits'
+        else: n += '_nlogits'
+
+        if self.F_bias != 'no' : n += f'_{self.F_bias}'
+        else: n += '_nbias'
+
+        if self.F_mask[0:2] == 's_':
+            if self.F_mask == 's_mask': n += f'_mask'
+            else: n += '_nmask'
+        else:
+            if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}'
+            else: n += '_nmask'
+
+        if self.F_lse == 't' : n += '_lse'
+        else: n += '_nlse'
+
+        if self.F_dropout == 't' : n += '_dropout'
+        else: n += '_ndropout'
+
+        if self.F_squant == 't' : n += '_squant'
+        else: n += '_nsquant'
+        return n
+
+class FmhaFwdApiPool:
+    def __init__(self, mask_impl):
+        self.pool = dict()
+        self.mask_impl = mask_impl
+
+    def register_traits(self, trait : FmhaFwdApiTrait) -> None:
+        # TODO: do we need to check duplication?
+        if trait.dtype not in self.pool.keys():
+            self.pool[trait.dtype] = dict()
+        if trait.hdim not in self.pool[trait.dtype].keys():
+            self.pool[trait.dtype][trait.hdim] = list()
+
+        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+
+    @property
+    def api(self) -> str:
+        per_dtypes=str()
+        for i, dtype in enumerate(self.pool.keys()):
+            per_hdim_case=str()
+            for j, hdim in enumerate(self.pool[dtype].keys()):
+                traits=self.pool[dtype][hdim]
+                inners=str()
+                for k, trait in enumerate(traits):
+                    if_k = 'if' if k == 0 else 'else if'
+                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
+                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                                   F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
+                                   F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_squant=BOOL_MAP[trait.squant],
+                                   F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_constraint=trait.constraint,
+                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
+                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
+                if_j = 'if' if j == 0 else 'else if'
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners)
+            if_i = 'if' if i == 0 else 'else if'
+            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+        if not per_dtypes:
+            # empty string we add some ignore to suppress warning in api
+            per_dtypes += '    (void)t ; (void)s ; (void)a;'
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes)
+
+@dataclass
+class FmhaFwdTileSize:
+    F_bm0        : int  # tile size along q seqlen (block size)
+    F_bn0        : int  # tile size along k seqlen
+    F_bk0        : int  # tile size along qk gemm unroll
+    F_bn1        : int  # tile size along v head_dim
+    F_bk1        : int  # tile size along kv gemm unroll
+    F_bk0max     : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_rm0        : int  # number of warps for gemm0 along q seqlen
+    F_rn0        : int  # number of warps for gemm0 along k seqlen
+    F_rk0        : int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1        : int  # number of warps for gemm1 along q seqlen
+    F_rn1        : int  # number of warps for gemm1 along head dim v
+    F_rk1        : int  # number of warps for gemm1 along k seqlen (not used)
+    F_wm0        : int  # gemm0 warp size along m
+    F_wn0        : int  # gemm0 warp size along n
+    F_wk0        : int  # gemm0 warp size along k
+    F_wm1        : int  # gemm1 warp size along m
+    F_wn1        : int  # gemm1 warp size along n
+    F_wk1        : int  # gemm1 warp size along k
+    F_occupancy  : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+    F_constraint : CppConstraint = field(default_factory=lambda: CppConstraint())
+
+    @property
+    def name(self) -> str:
+        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
+        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
+        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\
+        ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+
+@dataclass
+class FmhaFwdKernel:
+    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
+    F_hdim          : int  # hdim
+    F_dtype         : str  # data type
+    F_mode          : str  # value from MODE_MAP
+    F_tile          : FmhaFwdTileSize
+    F_pipeline      : FmhaFwdPipeline
+    mask_impl       : str
+
+    @property
+    def template(self) -> str:
+        kernel_body = str()
+        return FMHA_FWD_KERNEL_HEADER + \
+            FMHA_FWD_KERNEL_BODY.format(
+                F_idx           = self.F_idx,
+                F_hdim          = self.F_hdim,
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
+                F_bm0           = self.F_tile.F_bm0,
+                F_bn0           = self.F_tile.F_bn0,
+                F_bk0           = self.F_tile.F_bk0,
+                F_bn1           = self.F_tile.F_bn1,
+                F_bk1           = self.F_tile.F_bk1,
+                F_bk0max        = self.F_tile.F_bk0max,
+                F_rm0           = self.F_tile.F_rm0,
+                F_rn0           = self.F_tile.F_rn0,
+                F_rk0           = self.F_tile.F_rk0,
+                F_rm1           = self.F_tile.F_rm1,
+                F_rn1           = self.F_tile.F_rn1,
+                F_rk1           = self.F_tile.F_rk1,
+                F_wm0           = self.F_tile.F_wm0,
+                F_wn0           = self.F_tile.F_wn0,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
+                F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
+                F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
+                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
+                F_dpad          = BOOL_MAP[self.F_pipeline.F_dpad],
+                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
+                F_logits        = BOOL_MAP[self.F_pipeline.F_logits],
+                F_bias          = BIAS_MAP[self.F_pipeline.F_bias],
+                F_lse           = BOOL_MAP[self.F_pipeline.F_lse],
+                F_dropout       = BOOL_MAP[self.F_pipeline.F_dropout],
+                F_squant        = BOOL_MAP[self.F_pipeline.F_squant],
+                F_occupancy     = self.F_tile.F_occupancy,
+                F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
+                F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
+                F_mode          = MODE_MAP[self.F_mode],
+                F_pipeline      = FMHA_BATCH_PREFILL_PIPELINE_MAP[self.F_pipeline.tag])
+
+    @property
+    def name(self) -> str:
+        # TODO: we don't encode idx here
+        return f"fmha_batch_prefill_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \
+                self.F_tile.name + '_' + self.F_pipeline.name
+
+    @property
+    def filename(self) -> str:
+        return self.name + ".cpp"
+
+    def api_trait(self) -> FmhaFwdApiTrait:
+        return FmhaFwdApiTrait(
+                pipeline_tag=self.F_pipeline.tag,
+                hdim=str(self.F_hdim),
+                dtype=self.F_dtype,
+                mode=self.F_mode,
+                bm0=self.F_tile.F_bm0,
+                bn0=self.F_tile.F_bn0,
+                bk0=self.F_tile.F_bk0,
+                bn1=self.F_tile.F_bn1,
+                bk1=self.F_tile.F_bk1,
+                bk0max=self.F_tile.F_bk0max,
+                vlayout=self.F_pipeline.F_vlayout,
+                mask=self.F_pipeline.F_mask,
+                logits=self.F_pipeline.F_logits,
+                bias=self.F_pipeline.F_bias,
+                lse=self.F_pipeline.F_lse,
+                dropout=self.F_pipeline.F_dropout,
+                squant=self.F_pipeline.F_squant,
+                spad=self.F_pipeline.F_spad,
+                skpad=self.F_pipeline.F_skpad,
+                dpad=self.F_pipeline.F_dpad,
+                dvpad=self.F_pipeline.F_dvpad,
+                constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint)
+
+class KernelComponentFactory:
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
+        if dtype == 'fp16' or dtype == 'bf16':
+            return {
+                128 : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+            }
+        else:
+            return None
+
+    @staticmethod
+    def get_pipelines(dtype, hdim, receipt, mask_impl) -> List[FmhaFwdPipeline]:
+        # this function will populate a list possible pipelines
+        # TODO: the order of List matters! the later in this list will be also be checked later
+        # TODO: currently for qr pipeline, let 't' padding to appear later!!
+        # TODO: how to design this more generic?
+        squant = 't' if dtype == 'fp8' else 'f'
+        pipelines = []
+        if dtype in ['fp16', 'bf16']:
+            for logits, mask, bias, lse, dropout in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
+                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask))
+                    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask))
+                    # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask))
+                    # pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask))
+        else:
+            assert False
+        return pipelines
+
+class CustomFactory(KernelComponentFactory):
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
+        result = KernelComponentFactory.get_hdim_tile_size_dict(dtype)
+        if dtype == 'fp16' or dtype == 'bf16':
+            if 128 in result.keys():
+                result[128].insert(0, FmhaFwdTileSize( 64, 128, 64, 128, 64,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint('get_num_blocks(128) < num_cus * min_cu_util_rate')))
+        return result
+
+def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
+    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
+    #       support this in future
+
+    gen = list()
+    api_pool = FmhaFwdApiPool(mask_impl)
+
+    for dtype in FWD_DTYPE_MAP.keys():
+        d = CustomFactory.get_hdim_tile_size_dict(dtype)
+        if d == None:
+            continue
+        #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
+        for (hdim, tiles), mode in itertools.product(d.items(), MODE_MAP.keys()):
+            for tile, pipeline in itertools.product(tiles, CustomFactory.get_pipelines(dtype, hdim, receipt, mask_impl)):
+                if mode == "group":
+                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
+                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
+                        continue
+                if hdim == 192 and tile.F_bn1 == 128:
+                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
+                    if pipeline.F_bias != 'no' or pipeline.F_lse == 't' or pipeline.F_dropout == 't':
+                        continue
+                # logits_soft_cap is only allowed if no bias
+                if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
+                    continue
+                k = FmhaFwdKernel(F_idx=0,
+                                  F_hdim=hdim,
+                                  F_dtype=dtype,
+                                  F_mode=mode,
+                                  F_tile=tile,
+                                  F_pipeline=pipeline,
+                                  mask_impl=mask_impl)
+                if kernel_filter != '':
+                    if not fnmatch.fnmatch(k.name, kernel_filter):
+                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
+                # 2 - Flash attention integration
+                if receipt in (2, 3):
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # PyTorch integration
+                elif receipt == 4:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'bias']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_fwd) integration
+                elif receipt == 100:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'batch'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_batch_prefill) integration
+                elif receipt == 200:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'group'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # aiter::mha_batch_prefill C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'group'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                api_pool.register_traits(k.api_trait())
+                gen.append(k)
+
+    return (api_pool, gen)
+
+def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
+    (autogen_dir / kernel.filename).write_text(kernel.template)
+
+def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
+    (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api)
+
+def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+    for kernel in kernels:
+        write_single_fwd_kernel(kernel, output_dir)
+    write_fwd_api(api_pool, output_dir)
+
+def list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    with file_path.open('a') as f:
+        _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+        for kernel in kernels:
+            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
+        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 # generate kernel instances to speed up compilation

 import copy
@@ -7,22 +7,14 @@ from dataclasses import dataclass
 import fnmatch
 import itertools
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Tuple, Dict, Literal, Any
+from collections import defaultdict

 from codegen.cmake_config import *
 from codegen.cpp_symbol_map import *
+from codegen.utils import update_file


-BWD_DQDKDV_PIPELINE_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVRIGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdDQDKDVPipelineKRKTRVR",
-}
-
-BWD_DQDKDV_PIPELINE_ENUM_MAP = {
-    "kr_ktr_vr_iglp" : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR_IGLP",
-    "kr_ktr_vr"      : "ck_tile::BlockFmhaBwdPipelineEnum::KRKTRVR",
-}
-
 FMHA_BWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
@@ -39,6 +31,7 @@ using fmha_block_warps1_{F_idx} = ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>;
 using fmha_block_warps2_{F_idx} = ck_tile::sequence<{F_rm2}, {F_rn2}, {F_rk2}>;
 using fmha_warp_tile0_{F_idx}   = ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>;
 using fmha_warp_tile1_{F_idx}   = ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>;
+using fmha_warp_tile2_{F_idx}   = ck_tile::sequence<{F_wm0}, {F_wn0}, ck_tile::min({F_wk0}, {F_bk4})>;

 // TODO: simplify Gemm0~4BlockWarps in TileFmhaBwdShape
 //       G0&G2 -> GSdP
@@ -54,12 +47,14 @@ using fmha_bwd_shape_{F_idx} = ck_tile::TileFmhaBwdShape<fmha_block_tile_{F_idx}
                                                         fmha_block_warps1_{F_idx},
                                                         fmha_warp_tile1_{F_idx},
                                                         fmha_block_warps2_{F_idx},
-                                                         fmha_warp_tile0_{F_idx}>;
+                                                         fmha_warp_tile2_{F_idx},
+                                                         {F_maxq}>;

-using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
-                                                       {F_skpad},
+using fmha_bwd_trait_{F_idx} = ck_tile::TileFmhaTraits<false,  /* kPadSeqLenQ */
+                                                       false,  /* kPadSeqLenK */
                                                       {F_dpad},
                                                       {F_dvpad},
+                                                       false,
                                                       {F_bias},
                                                       {F_dbias},
                                                       false,
@@ -90,40 +85,47 @@ using fmha_bwd_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdPipelineProblem<
    {F_deterministic},
    fmha_mask_{F_idx},
    fmha_dropout_{F_idx},
+    {F_trload},
    fmha_bwd_trait_{F_idx}>;

-using fmha_bwd_pipeline_{F_idx} = {F_pipeline}<fmha_bwd_pipeline_problem_{F_idx}>;
+using fmha_bwd_pipeline_{F_idx} = ck_tile::BlockFmhaBwdDQDKDVPipeline<fmha_bwd_pipeline_problem_{F_idx}>;

 using fmha_bwd_dk_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
    ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                      typename FmhaBwdTypeConfig<{F_dtype}>::KGradDataType,
-                                      {F_skpad},
+                                      false,
                                      {F_dpad}>>;

 using fmha_bwd_dv_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
    ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
                                      typename FmhaBwdTypeConfig<{F_dtype}>::VGradDataType,
-                                      {F_skpad},
+                                      false,
                                      {F_dvpad}>>;

+using fmha_bwd_dq_epilogue_{F_idx} = ck_tile::Default2DEpilogue<
+    ck_tile::Default2DEpilogueProblem<typename FmhaBwdTypeConfig<{F_dtype}>::AccDataType,
+                                      typename FmhaBwdTypeConfig<{F_dtype}>::QGradDataType,
+                                      false,
+                                      {F_dpad}>>;
+
 using fmha_bwd_dq_dk_dv_kernel_{F_idx} =
    ck_tile::FmhaBwdDQDKDVKernel<fmha_bwd_pipeline_{F_idx},
                                 fmha_bwd_dk_epilogue_{F_idx},
-                                 fmha_bwd_dv_epilogue_{F_idx}>;
+                                 fmha_bwd_dv_epilogue_{F_idx},
+                                 fmha_bwd_dq_epilogue_{F_idx}>;

 using dq_dk_dv_trait_{F_idx} = fmha_bwd_dq_dk_dv_traits_<{F_hdim},
                                                         {F_dtype},
                                                         {F_mode},
-                                                         {F_pipeline_enum},
                                                         fmha_mask_{F_idx},
                                                         fmha_dropout_{F_idx},
                                                         {F_bias},
                                                         {F_dbias},
-                                                         {F_spad},
-                                                         {F_skpad},
                                                         {F_dpad},
                                                         {F_dvpad},
-                                                         {F_deterministic}>;
+                                                         {F_deterministic},
+                                                         {F_trload},
+                                                         {F_maxq}>;

 #include <iostream>

@@ -152,6 +154,13 @@ void fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_{F_idx}>(const ck_tile::stream_co
        ck_tile::stream_config{{s.stream_id_}});
 }}

+template <>
+int fmha_bwd_dq_dk_dv_maxq_<dq_dk_dv_trait_{F_idx}>()
+{{
+    using k_ = fmha_bwd_dq_dk_dv_kernel_{F_idx};
+    return k_::kMaxSeqLenQ;
+}}
+
 template <>
 std::string fmha_bwd_dq_dk_dv_get_name_<dq_dk_dv_trait_{F_idx}>()
 {{
@@ -167,135 +176,59 @@ FMHA_BWD_API="""
 template <typename dot_do_o_trait_, typename dq_dk_dv_trait_, typename convert_dq_trait_>
 float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a)
 {{
-    if(s.log_level_ > 0)
-        std::cout << ", " << fmha_bwd_dot_do_o_get_name_<dot_do_o_trait_>() << ", " << fmha_bwd_dq_dk_dv_get_name_<dq_dk_dv_trait_>() << ", " << fmha_bwd_convert_dq_get_name_<convert_dq_trait_>() << std::flush;
-    return ck_tile::launch_kernel(s,
-        [=](const ck_tile::stream_config& s_){{ fmha_bwd_dot_do_o_oneshot_<dot_do_o_trait_>(s_, a); }},
-        [=](const ck_tile::stream_config& s_){{ fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_>(s_, a); }},
-        [=](const ck_tile::stream_config& s_){{ fmha_bwd_convert_dq_oneshot_<convert_dq_trait_>(s_, a); }}
-    );
+    if constexpr (!std::is_same_v<convert_dq_trait_, void>)
+    {{
+        if(s.log_level_ > 0)
+            std::cout << ", " << fmha_bwd_dot_do_o_get_name_<dot_do_o_trait_>() << "@" << fmha_bwd_convert_dq_get_name_<convert_dq_trait_>() << "@" << fmha_bwd_dq_dk_dv_get_name_<dq_dk_dv_trait_>() << std::flush;
+        return ck_tile::launch_kernel(s,
+            [=](const ck_tile::stream_config& s_){{ fmha_bwd_dot_do_o_oneshot_<dot_do_o_trait_>(s_, a); }},
+            [=](const ck_tile::stream_config& s_){{ fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_>(s_, a); }},
+            [=](const ck_tile::stream_config& s_){{ fmha_bwd_convert_dq_oneshot_<convert_dq_trait_>(s_, a); }}
+        );
+    }}
+    else
+    {{
+        if(s.log_level_ > 0)
+            std::cout << ", " << fmha_bwd_dot_do_o_get_name_<dot_do_o_trait_>() << "@" << fmha_bwd_dq_dk_dv_get_name_<dq_dk_dv_trait_>() << std::flush;
+        return ck_tile::launch_kernel(s,
+            [=](const ck_tile::stream_config& s_){{ fmha_bwd_dot_do_o_oneshot_<dot_do_o_trait_>(s_, a); }},
+            [=](const ck_tile::stream_config& s_){{ fmha_bwd_dq_dk_dv_oneshot_<dq_dk_dv_trait_>(s_, a); }}
+        );
+    }}
 }}

 template <>
 float fmha_bwd<2>(fmha_bwd_traits t, fmha_bwd_args a, const ck_tile::stream_config& s){{
+    const bool has_load_tr = ck_tile::is_load_tr_supported();
    float r = -1;
 {F_dispatch}
    return r;
 }}
 """

-FMHA_BWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
-{F_hdim_case}
-    }}
-"""
-FMHA_BWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim}) {{
-{F_inner_dispatch}
-        }}
+def FMHA_BWD_API_COND_STATEMENT(F_cond: str, F_body: str, *, indent=0, if_ = 0) -> str:
+    lines = [
+        f"{'if' if if_ == 0 else 'else if'}({F_cond})",
+        "{",
+        *['    ' + line for line in F_body.split('\n') if line.strip() != ''],
+        "}",
+    ]
+    return '\n'.join(' ' * indent + line for line in lines) + '\n'
+
+
+FMHA_BWD_API_INNER_DISPATCH="""
+{F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
+        ({F_scheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
+    using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dvpad}>;
+    using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_dpad}, {F_dvpad}, {F_deterministic}, {F_trload}, {F_maxq}>;
+    using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1d}, {F_dpad}, {F_deterministic}>;
+    r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, std::conditional_t<{F_convert_dq_enabled}, convert_dq_trait_, void>>(s, a);
+    return r;
+}}
 """

-FMHA_BWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_dbias == {F_dbias}) && ({F_dropout_check}) &&
-                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.is_deterministic == {F_deterministic})) {{
-                using dot_do_o_trait_ = fmha_bwd_dot_do_o_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dvpad}>;
-                using dq_dk_dv_trait_ = fmha_bwd_dq_dk_dv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_pipeline_enum}, {F_mask}, {F_dropout}, {F_bias}, {F_dbias}, {F_spad0}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_deterministic}>;
-                using convert_dq_trait_ = fmha_bwd_convert_dq_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_spad1}, {F_dpad}, {F_deterministic}>;
-                r = fmha_bwd_<dot_do_o_trait_, dq_dk_dv_trait_, convert_dq_trait_>(s, a);
-                return r;
-            }}
-"""
-
-@dataclass
-class FmhaBwdDQDKDVApiTrait:
-    pipeline      : str
-    # sync with fmha_bwd_traits<>, to generate fallback calls
-    hdim          : str
-    dtype         : str  # data type
-    mode          : str  # value from MODE_MAP
-    bm0           : int  # tile size along q seqlen (block size)
-    bn0           : int  # tile size along k seqlen
-    bhdq          : int  # q head_dim
-    bhdv          : int  # v head_dim
-    mask          : str
-    bias          : str
-    dbias         : str
-    dropout       : str
-    spad          : str
-    skpad         : str
-    dpad          : str
-    dvpad         : str
-    deterministic : str
-
-    def scheck(self, spad1 : str) -> str:
-        if self.mode == 'group':
-            return 'true' # always support
-        elif self.spad == 't' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} != 0'
-        elif self.spad == 'f' and spad1 == 't':
-            return f'a.seqlen_q % {self.bm0} == 0 and a.seqlen_q % 64 != 0'
-        else: # self.skpad == 'f' and skpad1 == 'f'
-            return f'a.seqlen_q % 64 == 0'
-
-    @property
-    def skcheck(self) -> str:
-        if self.mode == 'group':
-            return 'true' # always support
-        elif self.skpad == 't':
-            return f'a.seqlen_k % {self.bn0} != 0'
-        else:
-            return f'a.seqlen_k % {self.bn0} == 0'
-
-    @property
-    def dcheck(self) -> str:
-        if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
-        else :               return f'a.hdim_q % {self.bhdq} == 0'
-
-    @property
-    def dvcheck(self) -> str:
-        if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
-        else :                return f'a.hdim_v % {self.bhdv} == 0'
-
-class FmhaBwdApiPool:
-    def __init__(self, mask_impl):
-        self.dq_dk_dv_pool = dict()
-        self.mask_impl = mask_impl
-
-    def register_dq_dk_dv_traits(self, trait : FmhaBwdDQDKDVApiTrait) -> None:
-        # TODO: do we need to check duplication?
-        if trait.dtype not in self.dq_dk_dv_pool.keys():
-            self.dq_dk_dv_pool[trait.dtype] = dict()
-        if trait.hdim not in self.dq_dk_dv_pool[trait.dtype].keys():
-            self.dq_dk_dv_pool[trait.dtype][trait.hdim] = list()
-
-        self.dq_dk_dv_pool[trait.dtype][trait.hdim].append(copy.copy(trait))
-
-    @property
-    def api(self) -> str:
-        per_dtypes=str()
-        for i, dtype in enumerate(self.dq_dk_dv_pool.keys()):
-            per_hdim_case=str()
-            for j, hdim in enumerate(self.dq_dk_dv_pool[dtype].keys()):
-                traits=self.dq_dk_dv_pool[dtype][hdim]
-                hdim_int = int(hdim)
-                inners=str()
-                for k, trait in enumerate(traits):
-                    if_k = 'if' if k == 0 else 'else if'
-                    for spad1 in ["t", "f"]:
-                        if (spad1 == "f" and (trait.spad == "t" or trait.mode == "group")):
-                            continue
-                        inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline],
-                                    F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
-                                    F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
-                                    F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype],
-                                    F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                    F_deterministic=BOOL_MAP[trait.deterministic])
-
-                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_BWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
-            if_i = 'if' if i == 0 else 'else if'
-            per_dtypes = per_dtypes + FMHA_BWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
-        if not per_dtypes:
-            # empty string we add some ignore to suppress warning in api
-            per_dtypes += '    (void)t ; (void)s ; (void)a;'
-        return FMHA_BWD_KERNEL_HEADER + FMHA_BWD_API.format(F_dispatch = per_dtypes)
+# M0 size for 1d kernels (dot/convert)
+M0_1D = 64

 # GEMM0: Q@K=S^T
 # GEMM1: P^T@dO^T=dV(This was chosen as G1 to match fwd, but N1 must be equal to headdim_v)
@@ -303,7 +236,7 @@ class FmhaBwdApiPool:
 # GEMM3: dS^T@Q^T=dK(Similar to G1, but N3 must be equal to headdim_qk)
 # GEMM4: dS@K^T=dQ(N4 must be equal to headdim_qk)
 # Is it necessary to distinguish between K0~K4?
-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdDQDKDVTileSize:
    F_bm0       : int  # tile size along q seqlen (block size)
    F_bn0       : int  # tile size along k seqlen
@@ -330,20 +263,20 @@ class FmhaBwdDQDKDVTileSize:
    F_wn1       : int  # warp size along n in gemm1/gemm3
    F_wk1       : int  # warp size along k in gemm1/gemm3
    F_occupancy : int  # occupancy
+    max_seq_q   : int = 0
+
    @property
    def name(self) -> str:
        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bk1}x{self.F_bk2}x{self.F_bk3}x{self.F_bk4}x{self.F_bhdq}x{self.F_bhdv}" +\
        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}_r{self.F_rm2}x{self.F_rn2}x{self.F_rk2}" +\
-        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}_o{self.F_occupancy}"
+        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}_o{self.F_occupancy}_maxq{self.max_seq_q}"

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdDQDKDVKernel:
    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim          : int  # hdim
    F_dtype         : str  # data type
    F_tile          : FmhaBwdDQDKDVTileSize
-    F_spad          : str  # true/false
-    F_skpad         : str  #
    F_dpad          : str  #
    F_dvpad         : str  #
    F_bias          : str  #
@@ -352,8 +285,8 @@ class FmhaBwdDQDKDVKernel:
    F_mask          : str  # value from MASK_MAP
    F_mode          : str  # value from MODE_MAP
    F_deterministic : str  #
-    F_pipeline      : str  #
    mask_impl       : str  #
+    F_trload       : str  #

    @property
    def template(self) -> str:
@@ -386,8 +319,6 @@ class FmhaBwdDQDKDVKernel:
                F_wm1           = self.F_tile.F_wm1,
                F_wn1           = self.F_tile.F_wn1,
                F_wk1           = self.F_tile.F_wk1,
-                F_spad          = BOOL_MAP[self.F_spad],
-                F_skpad         = BOOL_MAP[self.F_skpad],
                F_dpad          = BOOL_MAP[self.F_dpad],
                F_dvpad         = BOOL_MAP[self.F_dvpad],
                F_bias          = BIAS_MAP[self.F_bias],
@@ -397,21 +328,20 @@ class FmhaBwdDQDKDVKernel:
                F_mask          = get_mask_map(self.mask_impl)[self.F_mask],
                F_mode          = MODE_MAP[self.F_mode],
                F_deterministic = BOOL_MAP[self.F_deterministic],
-                F_pipeline_enum = BWD_DQDKDV_PIPELINE_ENUM_MAP[self.F_pipeline],
-                F_pipeline      = BWD_DQDKDV_PIPELINE_MAP[self.F_pipeline])
+                F_trload        = BOOL_MAP[self.F_trload],
+                F_maxq          = self.F_tile.max_seq_q
+            )

    @property
    def name(self) -> str:
        def pad_name() -> str:
            n = ''
-            if self.F_spad == 't': n += 's'
-            if self.F_skpad == 't' : n += 'sk'
            if self.F_dpad == 't' : n += 'd'
            if self.F_dvpad == 't' : n += 'dv'
            if n != '' : n = 'p' + n
            return n
        pn = pad_name()
-        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name + f'_{self.F_pipeline}'
+        n = f"fmha_bwd_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + self.F_tile.name
        if pn != '' : n += f'_{pn}'
        else: n += '_npad'

@@ -433,122 +363,34 @@ class FmhaBwdDQDKDVKernel:

        if self.F_deterministic == 't' : n += '_deterministic'
        else: n += '_ndeterministic'
+
+        if self.F_trload == 't' : n += '_trload'
+        else: n += '_ntrload'
        return n

    @property
    def filename(self) -> str:
        return self.name + ".cpp"

-    def api_trait(self) -> FmhaBwdDQDKDVApiTrait:
-        return FmhaBwdDQDKDVApiTrait(pipeline=self.F_pipeline,
-                hdim=str(self.F_hdim),
-                dtype=self.F_dtype,
-                mode=self.F_mode,
-                bm0=self.F_tile.F_bm0,
-                bn0=self.F_tile.F_bn0,
-                bhdq=self.F_tile.F_bhdq,
-                bhdv=self.F_tile.F_bhdv,
-                mask=self.F_mask,
-                bias=self.F_bias,
-                dbias=self.F_dbias,
-                dropout=self.F_dropout,
-                spad=self.F_spad,
-                skpad=self.F_skpad,
-                dpad=self.F_dpad,
-                dvpad=self.F_dvpad,
-                deterministic=self.F_deterministic
-                )
-
 # TODO: design a more practical way to do it
-# this is current supported tile size & pipeline.
-def get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype : str) -> Optional[dict]:
-    if dtype == 'fp16' or dtype == 'bf16':
-        return {
-            '32'  : [FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '64'  : [FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '128' : [FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"],
-            '256' : [FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
-                        "kr_ktr_vr_iglp", "kr_ktr_vr"]
-        }
+# this is current supported tile size.
+def get_dq_dk_dv_tiles(dtype : str, tr_load: str) -> List[FmhaBwdDQDKDVTileSize]:
+    if (dtype == 'fp16' or dtype == 'bf16') and tr_load == 'f':
+        return [
+            FmhaBwdDQDKDVTileSize( 32, 128,  32, 32,  32, 32, 64,  32,  32, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 32, 128,  64, 32,  64, 32, 32,  64,  64, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 16, 128, 128, 16, 128, 16, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+            # FmhaBwdDQDKDVTileSize( 32, 64, 160, 32, 160, 32, 32, 160, 160, 1, 4, 1, 4, 1, 1, 2, 2, 1, 16, 16, 32, 16, 16, 16, 1),
+            FmhaBwdDQDKDVTileSize( 16,  64, 256, 16, 256, 16, 32, 256, 256, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 16, 1),
+        ]
+    elif (dtype == 'fp16' or dtype == 'bf16') and tr_load == 't':
+        return [
+                FmhaBwdDQDKDVTileSize( 32, 128, 128, 32, 128, 32, 32, 128, 128, 1, 4, 1, 4, 1, 1, 1, 4, 1, 16, 16, 32, 16, 16, 32, 1),
+                # FmhaBwdDQDKDVTileSize( 16, 32, 128, 16, 128, 16, 32, 128, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 16, 1, 16),
+                FmhaBwdDQDKDVTileSize( 16,  16, 128, 16, 128, 16, 16, 128, 128, 1, 1, 1, 1, 1, 1, 1, 1, 1, 16, 16, 32, 16, 16, 16, 2, 16),
+        ]
    else:
-        return None
-
-def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaBwdApiPool, List[FmhaBwdDQDKDVKernel]]:
-    # TODO: we don't support tuning yet, so pick up one value for pad
-    #       support this in future
-    gen = list()
-    api_pool = FmhaBwdApiPool(mask_impl)
-
-    for dtype in BWD_DTYPE_MAP.keys():
-        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
-            continue
-        for hdim_str, mode, mask, bias, dbias, dropout, spad, skpad, dpad, dvpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"], ["t", "f"], ["t", "f"]):
-            tile = d[hdim_str][0]
-            ppl = d[hdim_str][1]
-            hdim = int(hdim_str)
-            if (mode == "group") and (spad == "f" or skpad == "f"):
-                continue
-            if ((bias == "no" or bias == "alibi") and dbias == "t"):
-                continue
-            if ("wg32" in dropout):
-                continue
-            if (dpad == "t" or dvpad == "t"):
-                ppl = d[hdim_str][2]
-            k = FmhaBwdDQDKDVKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, F_tile=tile,
-                                F_spad=spad, F_skpad=skpad, F_dpad=dpad, F_dvpad=dvpad,
-                                F_bias=bias, F_dbias=dbias, F_dropout=dropout, F_mask=mask, F_mode=mode,
-                                F_pipeline=ppl, mask_impl=mask_impl, F_deterministic=deterministic)
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
-                    continue
-            # Flash attention integration
-            if receipt == 2:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'alibi']
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    if not cond:
-                        continue
-            elif receipt == 3:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'alibi']
-                    cond &= dpad == dvpad
-                    cond &= deterministic == "f"
-                    if not cond:
-                        continue
-            # PyTorch integration
-            elif receipt == 4:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= bias in ['no', 'bias']
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    cond &= deterministic == "f"
-                    if not cond:
-                        continue
-            # Aiter (mha_bwd) integration
-            elif receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    if not cond:
-                        continue
-            # Aiter (mha_varlen_bwd) integration
-            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
-                    cond &= dpad == dvpad
-                    if not cond:
-                        continue
-            api_pool.register_dq_dk_dv_traits(k.api_trait())
-            gen.append(k)
-
-    return (api_pool, gen)
+        return []

 FMHA_BWD_DOT_DO_O_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
@@ -560,7 +402,7 @@ using fmha_bwd_dot_do_o_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBwdOGradDot
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::OGradDataType,
    typename FmhaBwdTypeConfig<fmha_dtype_{F_idx}>::DDataType,
-    /* BlockSize = */ 64,
+    /* BlockSize = M0 = */ 64,
    {F_hdim},
    {F_mode},
    fmha_bwd_dot_do_o_trait_{F_idx}>;
@@ -608,7 +450,7 @@ std::string fmha_bwd_dot_do_o_get_name_<dot_do_o_trait_{F_idx}>()
 }}
 """

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdOGradDotOKernel:
    F_idx       : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim      : int  # hdim
@@ -648,44 +490,6 @@ class FmhaBwdOGradDotOKernel:
    def filename(self) -> str:
        return self.name + ".cpp"

-def get_bwd_dot_do_o_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdOGradDotOKernel]:
-    # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
-    #       support this in future
-    def get_occupancy(dtype, hdim):
-        return 2
-
-    gen = list()
-
-    for dtype in BWD_DTYPE_MAP.keys():
-        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
-            continue
-        for hdim_str, mode, spad, dvpad in itertools.product(d.keys(), MODE_MAP.keys(), ["t", "f"], ["t", "f"]):
-            hdim = int(hdim_str)
-            if (mode == "group" and spad == "f"):
-                continue
-            k = FmhaBwdOGradDotOKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype,
-                                F_spad=spad, F_dvpad=dvpad, F_mode=mode,
-                                F_occupancy=get_occupancy(dtype, hdim))
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
-                    continue
-            # Aiter (mha_bwd) integration
-            if receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    if not cond:
-                        continue
-            # Aiter (mha_varlen_bwd) integration
-            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    if not cond:
-                        continue
-            gen.append(k)
-
-    return gen
-
 FMHA_BWD_CONVERT_DQ_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};

@@ -752,7 +556,7 @@ std::string fmha_bwd_convert_dq_get_name_<convert_dq_trait_{F_idx}>()
 }}
 """

-@dataclass
+@dataclass(frozen=True)
 class FmhaBwdConvertQGradKernel:
    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
    F_hdim          : int  # hdim
@@ -764,6 +568,7 @@ class FmhaBwdConvertQGradKernel:
    F_mode          : str  # value from MODE_MAP
    F_occupancy     : int  #
    F_deterministic : str  #
+    disabled        : bool # sometimes this kernel is not used

    @property
    def template(self) -> str:
@@ -800,83 +605,275 @@ class FmhaBwdConvertQGradKernel:
    def filename(self) -> str:
        return self.name + ".cpp"

-def get_bwd_convert_dq_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaBwdConvertQGradKernel]:
-    # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
-    #       support this in future
-    def get_occupancy(dtype, hdim):
-        return 2
+@dataclass(frozen=True)
+class FmhaBwdApiTrait:
+    idx           : int  # this is not a tunable, but a counter to differentiate symbol
+    # sync with fmha_bwd_traits<>, to generate fallback calls
+    hdim          : int
+    dtype         : str  # data type
+    mode          : str  # value from MODE_MAP
+    tile          : FmhaBwdDQDKDVTileSize
+    mask          : str
+    bias          : str
+    dbias         : str
+    dropout       : str
+    spad1d        : str # spad for 1d kernels (dot/convert)
+    dpad          : str
+    dvpad         : str
+    deterministic : str
+    mask_impl     : str
+    tr_load       : str

-    gen = list()
+    @property
+    def bm0(self) -> int:
+        return self.tile.F_bm0
+    @property
+    def bn0(self) -> int:
+        return self.tile.F_bn0
+    @property
+    def bhdq(self) -> int:
+        return self.tile.F_bhdq
+    @property
+    def bhdv(self) -> int:
+        return self.tile.F_bhdv

-    for dtype in BWD_DTYPE_MAP.keys():
-        d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype)
-        if d == None:
-            continue
-        for hdim_str, mode, spad, dpad, deterministic in itertools.product(d.keys(), MODE_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
-            hdim = int(hdim_str)
-            tile = d[hdim_str][0]
-            if (mode == "group" and spad == "f"):
+    @property
+    def scheck(self) -> str:
+        if self.mode == 'group':
+            return 'true' # always support
+        elif self.spad1d == 't':
+            return f'a.seqlen_q % {M0_1D} != 0'
+        else: # self.spad1d == 'f'
+            return f'a.seqlen_q % {M0_1D} == 0'
+
+    @property
+    def dcheck(self) -> str:
+        if self.dpad == 't': return f'a.hdim_q % {self.bhdq} != 0'
+        else :               return f'a.hdim_q % {self.bhdq} == 0'
+
+    @property
+    def dvcheck(self) -> str:
+        if self.dvpad == 't': return f'a.hdim_v % {self.bhdv} != 0'
+        else :                return f'a.hdim_v % {self.bhdv} == 0'
+
+    @property
+    def dot_do_o_kernel(self) -> FmhaBwdOGradDotOKernel:
+        # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
+        #       support this in future
+        def get_occupancy(dtype, hdim):
+            return 2
+
+        return FmhaBwdOGradDotOKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_spad=self.spad1d,
+            F_dvpad=self.dvpad, F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim))
+
+    @property
+    def dq_dk_dv_kernel(self) -> FmhaBwdDQDKDVKernel:
+        return FmhaBwdDQDKDVKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype, F_tile=self.tile,
+            F_dpad=self.dpad, F_dvpad=self.dvpad, F_bias=self.bias, F_dbias=self.dbias, F_dropout=self.dropout,
+            F_mask=self.mask, F_mode=self.mode, F_deterministic=self.deterministic, mask_impl=self.mask_impl, F_trload=self.tr_load)
+
+    @property
+    def convert_dq_kernel(self) -> FmhaBwdConvertQGradKernel:
+        # TODO: we don't support tuning yet, so pick up one value for pad/occupancy
+        #       support this in future
+        def get_occupancy(dtype, hdim):
+            return 2
+
+        return FmhaBwdConvertQGradKernel(F_idx=self.idx, F_hdim=self.hdim, F_dtype=self.dtype,
+            F_bm0=M0_1D, F_bn0=self.tile.F_bn0, F_spad=self.spad1d, F_dpad=self.dpad,
+            F_mode=self.mode, F_occupancy=get_occupancy(self.dtype, self.hdim),
+            F_deterministic=self.deterministic, disabled=self.tile.max_seq_q != 0)
+
+class FmhaBwdApiPool:
+    def __init__(self, mask_impl):
+        self.dq_dk_dv_pool = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
+        
+        self.mask_impl = mask_impl
+
+    def register_dq_dk_dv_traits(self, trait : FmhaBwdApiTrait) -> None:
+        # TODO: do we need to check duplication?
+        self.dq_dk_dv_pool[trait.tr_load][trait.tile.max_seq_q][trait.dtype][trait.hdim].append(copy.copy(trait))
+
+    @staticmethod
+    def if_(i: int) -> str:
+        return 'if' if i == 0 else 'else if'
+
+    def _api_innders(self, traits: List[FmhaBwdApiTrait]) -> str:
+        inners = ""
+        i = 0 
+        for trait in traits:
+            inners += FMHA_BWD_API_INNER_DISPATCH.format(F_if=self.if_(i), F_mode=MODE_MAP[trait.mode],
+                F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias],
+                F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout],
+                F_scheck=trait.scheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=trait.hdim, F_dtype=BWD_DTYPE_MAP[trait.dtype],
+                F_spad1d=BOOL_MAP[trait.spad1d], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                F_deterministic=BOOL_MAP[trait.deterministic], F_trload=BOOL_MAP[trait.tr_load], F_maxq=trait.tile.max_seq_q,
+                F_convert_dq_enabled=BOOL_MAP[not trait.convert_dq_kernel.disabled])
+            i += 1
+        return inners
+
+    @staticmethod
+    def trload_sort_key(tf):
+        return 0 if tf == 't' else 1  # sort 't' before 'f'
+
+    @staticmethod
+    def max_seq_q_sort_key(max_seq_q):
+        return max_seq_q if max_seq_q != 0 else 1000000  # sort 0 to the end
+
+    @staticmethod
+    def max_seq_q_cond(max_seq_q: int) -> str:
+        if max_seq_q == 0:
+            return 'true /* no seqlen_q limit */'
+        else:
+            return f'a.seqlen_q <= {max_seq_q}'
+
+    @staticmethod
+    def dtype_cond(dtype: str) -> str:
+        return f't.data_type.compare("{dtype}") == 0'
+
+    @staticmethod
+    def hdim_cond(hdim: int) -> str:
+        return f't.hdim_q <= {hdim} && t.hdim_v <= {hdim}'
+
+    @property
+    def api(self) -> str:
+        tr_load_cond_map = {
+            "t": "has_load_tr",
+            "f": "true /* no trload requirement */"
+        }
+        per_tr_load = ''
+        for tr_load in sorted(self.dq_dk_dv_pool.keys(), key=self.trload_sort_key):
+            per_max_seq_q = ''
+            for max_seq_q in sorted(self.dq_dk_dv_pool[tr_load].keys(), key=self.max_seq_q_sort_key):
+                per_dtypes = ''
+                for j, dtype in enumerate(self.dq_dk_dv_pool[tr_load][max_seq_q]):
+                    per_hdim_case = ''
+                    for k, hdim in enumerate(self.dq_dk_dv_pool[tr_load][max_seq_q][dtype]):
+                        traits = self.dq_dk_dv_pool[tr_load][max_seq_q][dtype][hdim]
+                        inners = self._api_innders(traits)
+                        per_hdim_case += FMHA_BWD_API_COND_STATEMENT(if_=k, F_cond=self.hdim_cond(hdim), F_body=inners)
+                    per_dtypes += FMHA_BWD_API_COND_STATEMENT(if_=j, F_cond=self.dtype_cond(dtype), F_body=per_hdim_case)
+                per_max_seq_q += FMHA_BWD_API_COND_STATEMENT(F_cond=self.max_seq_q_cond(max_seq_q), F_body=per_dtypes)
+            per_tr_load += FMHA_BWD_API_COND_STATEMENT(F_cond=tr_load_cond_map[tr_load], F_body=per_max_seq_q, indent=4)
+        if not per_tr_load:
+            # empty string we add some ignore to suppress warning in api
+            per_tr_load += '    (void)t ; (void)s ; (void)a;'
+        result = FMHA_BWD_KERNEL_HEADER + FMHA_BWD_API.format(F_dispatch = per_tr_load)
+        return result.replace('\n\n', '\n')
+
+def get_bwd_blobs(filter_list: str, receipt, mask_impl, optdim_list) -> Tuple[FmhaBwdApiPool, List[FmhaBwdOGradDotOKernel], List[FmhaBwdDQDKDVKernel], List[FmhaBwdConvertQGradKernel]]:
+    if filter_list == '':
+        filter_list = '*@*@*'
+    filters = filter_list.split('@')
+    filters.extend(['*'] * (3 - len(filters)))
+    filter_dot_do_o = filters[0]
+    filter_convert_dq = filters[1]
+    filter_dq_dk_dv = filters[2]
+
+    # use dict as ordered set
+    gen_dot_do_o: Dict[FmhaBwdOGradDotOKernel, Literal[True]] = {}
+    gen_dq_dk_dv: Dict[FmhaBwdDQDKDVKernel, Literal[True]] = {}
+    gen_convert_dq: Dict[FmhaBwdConvertQGradKernel, Literal[True]] = {}
+    api_pool = FmhaBwdApiPool(mask_impl)
+
+    for dtype, tr_load in itertools.product(BWD_DTYPE_MAP.keys(), ["t", "f"]):
+        tiles: Any = get_dq_dk_dv_tiles(dtype, tr_load)
+        for tile, mode, mask, bias, dbias, dropout, spad1d, dpad, dvpad, deterministic in itertools.product(tiles, MODE_MAP.keys(), get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], DROPOUT_MAP.keys(), *([["t", "f"]] * 4)):
+            assert isinstance(tile, FmhaBwdDQDKDVTileSize), "tile must be FmhaBwdDQDKDVTileSize"
+            hdim = tile.F_bhdq
+            if (mode == "group") and (spad1d == "f"):
                continue
-            k = FmhaBwdConvertQGradKernel(F_idx=0, F_hdim=hdim, F_dtype=dtype, F_bm0=64, F_bn0=tile.F_bn0,
-                                F_spad=spad, F_dpad=dpad, F_mode=mode, F_occupancy=get_occupancy(dtype, hdim), F_deterministic=deterministic)
-            if kernel_filter != '':
-                if not fnmatch.fnmatch(k.name, kernel_filter):
+            if (mode == "group" or ('no' not in mask)) and tile.max_seq_q != 0:
+                continue
+            if ((bias == "no" or bias == "alibi") and dbias == "t"):
+                continue
+            if ("wg32" in dropout):
+                continue
+            if tr_load == "t" and (dpad == "t" or dvpad == "t"):
+                continue  # tr_load cannot work with dpad or dvpad
+            t = FmhaBwdApiTrait(idx=0, hdim=hdim, dtype=dtype, mode=mode,tile=tile,mask=mask, bias=bias, dbias=dbias, dropout=dropout, spad1d=spad1d, dpad=dpad, dvpad=dvpad, deterministic=deterministic, mask_impl=mask_impl, tr_load=tr_load)
+
+            if not fnmatch.fnmatch(t.dot_do_o_kernel.name, filter_dot_do_o):
+                continue
+            if not fnmatch.fnmatch(t.dq_dk_dv_kernel.name, filter_dq_dk_dv):
+                continue
+            if not fnmatch.fnmatch(t.convert_dq_kernel.name, filter_convert_dq):
+                continue
+            if optdim_list != [-1]:
+                if hdim not in optdim_list:
+                    continue
+
+            # Flash attention integration
+            if receipt == 2:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'alibi']
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                cond &= dpad == dvpad
+                if not cond:
+                    continue
+            elif receipt == 3:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'alibi']
+                cond &= dpad == dvpad
+                cond &= deterministic == "f"
+                if not cond:
+                    continue
+            # PyTorch integration
+            elif receipt == 4:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= bias in ['no', 'bias']
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                cond &= dpad == dvpad
+                cond &= deterministic == "f"
+                if not cond:
                    continue
            # Aiter (mha_bwd) integration
-            if receipt == 300:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "batch"
-                    if not cond:
-                        continue
+            elif receipt == 300:
+                cond = dtype in ['fp16', 'bf16']
+                cond &= mode == "batch"
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                if not cond:
+                    continue
            # Aiter (mha_varlen_bwd) integration
            elif receipt == 400:
-                    cond = dtype in ['fp16', 'bf16']
-                    cond &= mode == "group"
-                    if not cond:
-                        continue
-            gen.append(k)
+                cond = dtype in ['fp16', 'bf16']
+                cond &= mode == "group"
+                cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                if not cond:
+                    continue
+            # aiter::mha_bwd C++ api integration
+            elif receipt == 600:
+                cond = dtype in ['fp16', 'bf16']
+                if not cond:
+                    continue
+            gen_dot_do_o[t.dot_do_o_kernel] = True
+            gen_dq_dk_dv[t.dq_dk_dv_kernel] = True
+            if not t.convert_dq_kernel.disabled:
+                gen_convert_dq[t.convert_dq_kernel] = True
+            api_pool.register_dq_dk_dv_traits(t)

-    return gen
+    return api_pool, list(gen_dot_do_o.keys()), list(gen_dq_dk_dv.keys()), list(gen_convert_dq.keys())

-def write_single_bwd_dq_dk_dv_kernel(kernel: FmhaBwdDQDKDVKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
+def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels_dot_do_o,  kernels_dq_dk_dv,  kernels_convert_dq = get_bwd_blobs(filter_list, receipt, mask_impl, optdim_list)
+    update_file(output_dir / FMHA_BWD_API_FILENAME, api_pool.api)
+    for k in kernels_dot_do_o:
+        update_file(output_dir / k.filename, k.template)
+    for k in kernels_convert_dq:
+        update_file(output_dir / k.filename, k.template)
+    for k in kernels_dq_dk_dv:
+        update_file(output_dir / k.filename, k.template)

-def write_single_bwd_dot_do_o_kernel(kernel: FmhaBwdOGradDotOKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)

-def write_single_bwd_convert_dq_kernel(kernel: FmhaBwdConvertQGradKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
-
-def write_bwd_api(api_pool : FmhaBwdApiPool, autogen_dir: Path) -> None:
-    (autogen_dir / FMHA_BWD_API_FILENAME).write_text(api_pool.api)
-
-def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> None:
-    filter_list = filter_list.split('@')
-    filter_list.extend([''] * (3 - len(filter_list)))
-
-    kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
-    for kernel in kernels:
-        write_single_bwd_dot_do_o_kernel(kernel, output_dir)
-    kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
-    for kernel in kernels:
-        write_single_bwd_convert_dq_kernel(kernel, output_dir)
-    api_pool, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
-    for kernel in kernels:
-        write_single_bwd_dq_dk_dv_kernel(kernel, output_dir)
-    write_bwd_api(api_pool, output_dir)
-
-def list_blobs(file_path : Path, filter_list : str, receipt, mask_impl) -> None:
-    filter_list = filter_list.split('@')
-    filter_list.extend([''] * (3 - len(filter_list)))
-
-    with file_path.open('a') as f:
-        kernels = get_bwd_dot_do_o_blobs(filter_list[0], receipt)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        kernels = get_bwd_convert_dq_blobs(filter_list[1], receipt)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_bwd_dq_dk_dv_blobs(filter_list[2], receipt, mask_impl)
-        for kernel in kernels:
-            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
+def list_blobs(file_path: Path, filter_list: str, receipt, optdim_list, mask_impl) -> None:
+    _, kernels_dot_do_o, kernels_dq_dk_dv, kernels_convert_dq = get_bwd_blobs(
+        filter_list, receipt, mask_impl, optdim_list
+    )
+    with file_path.open("a") as f:
+        for k in kernels_dot_do_o:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
+        for k in kernels_dq_dk_dv:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
+        for k in kernels_convert_dq:
+            f.write(str(file_path.parent / GEN_DIR / k.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_BWD_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -3,14 +3,16 @@
 # generate kernel instances to speed up compilation

 import copy
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 import fnmatch
 import itertools
+import os
 from pathlib import Path
 from typing import List, Optional, Tuple

 from codegen.cmake_config import *
 from codegen.cpp_symbol_map import *
+from codegen.utils import update_file


 DTYPE_BITS = {
@@ -26,12 +28,14 @@ K0_MAX_SUBMAX_MAP = {
    64 : 64,
    96 : 128,
    128: 128,
+    192: 192,
    256: 256
 }

 FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
 // auto generated by generate.py
+#include "ck_tile/ops/fmha/block/variants.hpp"
 #include "fmha_fwd.hpp"
 """

@@ -51,12 +55,17 @@ using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
                                                    {F_skpad},
                                                    {F_dpad},
                                                    {F_dvpad},
+                                                    {F_logits},
                                                    {F_bias},
                                                    false,
                                                    {F_lse},
                                                    {F_dropout},
                                                    {F_squant},
-                                                    {F_occupancy}>;
+                                                    {F_occupancy},
+                                                    {F_skip}>;
+
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
+
 using fmha_mask_{F_idx} = {F_mask};

 using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
@@ -73,7 +82,9 @@ using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
    fmha_shape_{F_idx},
    {F_mode},
+    fmha_variant_{F_idx},
    fmha_mask_{F_idx},
+    {F_trload},
    fmha_trait_{F_idx}>;

 using fmha_pipeline_{F_idx} = {F_pipeline}<
@@ -88,7 +99,7 @@ using fmha_kernel_{F_idx} =
    ck_tile::FmhaFwdKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;

 using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;

 #include <iostream>

@@ -107,13 +118,64 @@ float fmha_fwd_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_args a)

 FMHA_FWD_API_FILENAME="fmha_fwd_api.cpp"
 FMHA_FWD_API="""
+#include <cstdio>
+
+#include <hip/hip_runtime.h>
+
+namespace {{
+bool get_num_cus(unsigned& num_cus) {{
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device");
+        return false;
+    }}
+
+    hipDeviceProp_t props{{}};
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device properties");
+        return false;
+    }}
+
+    num_cus = props.multiProcessorCount;
+    return true;
+}}
+
+unsigned get_num_thread_blocks(unsigned batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{
+    const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0;
+    const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1
+
+    return batch * nheads * num_m_blocks * num_n_blocks;
+}}
+}} // namespace
+
 float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const ck_tile::stream_config& s){{
    float r = -1;
+
+    [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate
+
+    unsigned num_cus;
+    if (!get_num_cus(num_cus)) {{
+        return r;
+    }}
+
+    [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{
+        return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0);
+    }};
+    
+    const bool has_load_tr = ck_tile::is_load_tr_supported();
+
 {F_dispatch}
    return r;
 }}
 """

+FMHA_FWD_API_PER_TRLOAD="""    {F_if}({F_trload_cond}){{
+{F_dtype_case}
+    }}
+"""
+
 FMHA_FWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
 {F_hdim_case}
    }}
@@ -123,52 +185,75 @@ FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <
        }}
 """

-FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) &&
-                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) &&
+                        ({F_scheck}) && ({F_seqtune}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint})) {{
+                using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_trload}, {F_skip}>;
                return fmha_fwd_<trait_>(s, a);
            }}
 """

+@dataclass
+class CppConstraint:
+    bool_expr: str = None
+
+    def __str__(self):
+        if self.bool_expr is None:
+            return 'true'
+        else:
+            return f'{self.bool_expr}'
+
+    def __and__(self, other):
+        return CppConstraint(f'({str(self)}) && ({str(other)})')
+
@dataclass
 class FmhaFwdApiTrait:
    pipeline_tag : str
    # sync with fmha_fwd_traits<>, to generate fallback calls
-    hdim      : str
-    dtype     : str  # data type
-    mode      : str  # value from MODE_MAP
-    bm0       : int  # tile size along q seqlen (block size)
-    bn0       : int  # tile size along qk seqlen
-    bk0       : int  # tile size along qk gemm unroll
-    bn1       : int  # tile size along v head_dim
-    bk1       : int  # tile size along kv gemm unroll
-    bk0max    : int
-    vlayout   : str
-    mask      : str
-    bias      : str  #
-    lse       : str  #
-    dropout   : str
-    squant    : str  #
-    spad      : str
-    skpad     : str
-    dpad      : str
-    dvpad     : str
+    hdim       : str
+    dtype      : str  # data type
+    mode       : str  # value from MODE_MAP
+    bm0        : int  # tile size along q seqlen (block size)
+    bn0        : int  # tile size along qk seqlen
+    bk0        : int  # tile size along qk gemm unroll
+    bn1        : int  # tile size along v head_dim
+    bk1        : int  # tile size along kv gemm unroll
+    bk0max     : int
+    vlayout    : str
+    logits     : str
+    mask       : str
+    bias       : str  #
+    lse        : str  #
+    dropout    : str
+    squant     : str  #
+    spad       : str
+    skpad      : str
+    dpad       : str
+    dvpad      : str
+    skip       : str
+    tr_load    : str
+    constraint : CppConstraint

    @property
    def name(self) -> str:
        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
-                    f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'
+                    f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}'

    @property
    def scheck(self) -> str:
        if self.mode == 'group': return 'true/*group mode spad always true*/'                  # group mode only generate spad/skpad == true
-        if self.pipeline_tag == 'qr_async':
+        if self.pipeline_tag in ['qr_async', 'qr_async_trload']:
            if self.spad == 't' : return 'true' # always support
            else :                return 'true'
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qs']:
            if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.seqlen_q % {self.bm0} == 0'
        else: assert False
+    
+    @property
+    def seqtune(self) -> str:
+        if self.bm0 == 128: return 'true/*fall back to largest tile*/'                  # group mode only generate spad/skpad == true
+        else: 
+            return f'a.seqlen_q <= {self.bm0}'

    @property
    def skcheck(self) -> str:
@@ -176,9 +261,12 @@ class FmhaFwdApiTrait:
        if self.pipeline_tag == 'qr_async':
            if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
            else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
-        elif self.pipeline_tag in ['qr', 'qr_fp8']:
+        elif self.pipeline_tag in ['qr', 'qs']:
            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.seqlen_k % {self.bn0} == 0'
+        elif self.pipeline_tag == 'qr_async_trload':
+            if self.skpad == 't' : return 'true'
+            else:                  return 'true'
        else: assert False

    @property
@@ -187,7 +275,7 @@ class FmhaFwdApiTrait:
            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
            else :               assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qs', 'qr_async_trload']:
            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :               return f'a.hdim_q % {bk0submax} == 0'
@@ -199,7 +287,7 @@ class FmhaFwdApiTrait:
            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
            else :                assert False
-        elif self.pipeline_tag in ['qr']:
+        elif self.pipeline_tag in ['qr', 'qs', 'qr_async_trload']:
            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
            else :                return f'a.hdim_v % {bk0submax} == 0'
@@ -209,16 +297,20 @@ class FmhaFwdApiTrait:
 class FmhaFwdPipeline:
    tag : str

-    F_vlayout   : str  # row/col
-    F_spad      : str  # true/false
-    F_skpad     : str  #
-    F_dpad      : str  #
-    F_dvpad     : str  #
-    F_bias      : str  # true/false
-    F_lse       : str  #
-    F_dropout   : str  #
-    F_squant    : str  #
-    F_mask      : str  # value from MASK_MAP
+    F_vlayout    : str  # row/col
+    F_spad       : str  # true/false
+    F_skpad      : str  #
+    F_dpad       : str  #
+    F_dvpad      : str  #
+    F_logits     : str  # t/f
+    F_bias       : str  # true/false
+    F_lse        : str  #
+    F_dropout    : str  #
+    F_squant     : str  #
+    F_mask       : str  # value from MASK_MAP
+    F_skip       : str  # true/false
+    F_trload     : str  # true/false
+    F_constraint : CppConstraint = field(default_factory=lambda: CppConstraint())

    @property
    def name(self) -> str:
@@ -235,6 +327,9 @@ class FmhaFwdPipeline:
        if pn != '' : n += f'_{pn}'
        else: n += '_npad'

+        if self.F_logits == 't' : n += '_logits'
+        else: n += '_nlogits'
+
        if self.F_bias != 'no' : n += f'_{self.F_bias}'
        else: n += '_nbias'

@@ -251,8 +346,15 @@ class FmhaFwdPipeline:
        if self.F_dropout == 't' : n += '_dropout'
        else: n += '_ndropout'

+        if self.F_skip == 't' : n += '_skip'
+        else: n += '_nskip'
+
        if self.F_squant == 't' : n += '_squant'
        else: n += '_nsquant'
+        
+        if self.F_trload == 't' : n += '_trload'
+        else: n += '_ntrload'
+
        return n

 class FmhaFwdApiPool:
@@ -264,59 +366,71 @@ class FmhaFwdApiPool:
        # TODO: do we need to check duplication?
        if trait.dtype not in self.pool.keys():
            self.pool[trait.dtype] = dict()
-        if trait.hdim not in self.pool[trait.dtype].keys():
-            self.pool[trait.dtype][trait.hdim] = list()
+        hdim = trait.hdim, trait.bn1
+        if hdim not in self.pool[trait.dtype].keys():
+            self.pool[trait.dtype][hdim] = list()

-        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+        self.pool[trait.dtype][hdim].append(copy.copy(trait))

    @property
    def api(self) -> str:
-        per_dtypes=str()
-        for i, dtype in enumerate(self.pool.keys()):
-            per_hdim_case=str()
-            for j, hdim in enumerate(self.pool[dtype].keys()):
-                traits=self.pool[dtype][hdim]
-                inners=str()
-                for k, trait in enumerate(traits):
-                    if_k = 'if' if k == 0 else 'else if'
-                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
-                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask],
-                                   F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
-                                   F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout] ,
-                                   F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
-                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
-                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
-                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
-                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners)
-            if_i = 'if' if i == 0 else 'else if'
-            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
-        if not per_dtypes:
+        tr_load_cond_map = {
+            "t": "has_load_tr",
+            "f": "true"
+        }
+        
+        per_tr_load =str()
+        for tr_load in ["t", "f"]:
+            per_dtypes=str()
+            for i, dtype in enumerate(self.pool.keys()):
+                per_hdim_case=str()
+                for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()):
+                    traits=self.pool[dtype][(hdim, hdim_v)]
+                    inners=str()
+                    for k, trait in enumerate(traits):
+                        if_k = 'if' if k == 0 else 'else if'
+                        inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
+                                       F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                                       F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
+                                       F_lse=BOOL_MAP[trait.lse], F_dropout=BOOL_MAP[trait.dropout], F_skip=BOOL_MAP[trait.skip], F_trload=BOOL_MAP[trait.tr_load],
+                                       F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_seqtune=trait.seqtune, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
+                                       F_constraint=trait.constraint,
+                                       F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                                       F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
+                                       F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
+                    if_j = 'if' if j == 0 else 'else if'
+                    per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners)
+                if_i = 'if' if i == 0 else 'else if'
+                per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+            per_tr_load += FMHA_FWD_API_PER_TRLOAD.format(F_if='if', F_trload_cond=tr_load_cond_map[tr_load], F_dtype_case=per_dtypes)
+        if not per_tr_load:
            # empty string we add some ignore to suppress warning in api
-            per_dtypes += '    (void)t ; (void)s ; (void)a;'
-        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes)
+            per_tr_load += '    (void)t ; (void)s ; (void)a;'
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_tr_load)

@dataclass
 class FmhaFwdTileSize:
-    F_bm0       : int  # tile size along q seqlen (block size)
-    F_bn0       : int  # tile size along k seqlen
-    F_bk0       : int  # tile size along qk gemm unroll
-    F_bn1       : int  # tile size along v head_dim
-    F_bk1       : int  # tile size along kv gemm unroll
-    F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
-    F_rm0       : int  # number of warps for gemm0 along q seqlen
-    F_rn0       : int  # number of warps for gemm0 along k seqlen
-    F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
-    F_rm1       : int  # number of warps for gemm1 along q seqlen
-    F_rn1       : int  # number of warps for gemm1 along head dim v
-    F_rk1       : int  # number of warps for gemm1 along k seqlen (not used)
-    F_wm0       : int  # gemm0 warp size along m
-    F_wn0       : int  # gemm0 warp size along n
-    F_wk0       : int  # gemm0 warp size along k
-    F_wm1       : int  # gemm1 warp size along m
-    F_wn1       : int  # gemm1 warp size along n
-    F_wk1       : int  # gemm1 warp size along k
-    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+    F_bm0        : int  # tile size along q seqlen (block size)
+    F_bn0        : int  # tile size along k seqlen
+    F_bk0        : int  # tile size along qk gemm unroll
+    F_bn1        : int  # tile size along v head_dim
+    F_bk1        : int  # tile size along kv gemm unroll
+    F_bk0max     : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_rm0        : int  # number of warps for gemm0 along q seqlen
+    F_rn0        : int  # number of warps for gemm0 along k seqlen
+    F_rk0        : int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1        : int  # number of warps for gemm1 along q seqlen
+    F_rn1        : int  # number of warps for gemm1 along head dim v
+    F_rk1        : int  # number of warps for gemm1 along k seqlen (not used)
+    F_wm0        : int  # gemm0 warp size along m
+    F_wn0        : int  # gemm0 warp size along n
+    F_wk0        : int  # gemm0 warp size along k
+    F_wm1        : int  # gemm1 warp size along m
+    F_wn1        : int  # gemm1 warp size along n
+    F_wk1        : int  # gemm1 warp size along k
+    F_occupancy  : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+    F_constraint : CppConstraint = field(default_factory=lambda: CppConstraint())
+
    @property
    def name(self) -> str:
        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
@@ -365,15 +479,18 @@ class FmhaFwdKernel:
                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
                F_dpad          = BOOL_MAP[self.F_pipeline.F_dpad],
                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
+                F_logits        = BOOL_MAP[self.F_pipeline.F_logits],
                F_bias          = BIAS_MAP[self.F_pipeline.F_bias],
                F_lse           = BOOL_MAP[self.F_pipeline.F_lse],
                F_dropout       = BOOL_MAP[self.F_pipeline.F_dropout],
                F_squant        = BOOL_MAP[self.F_pipeline.F_squant],
+                F_skip          = BOOL_MAP[self.F_pipeline.F_skip],
                F_occupancy     = self.F_tile.F_occupancy,
                F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
                F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
                F_mode          = MODE_MAP[self.F_mode],
-                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag])
+                F_pipeline      = PIPELINE_MAP[self.F_pipeline.tag],
+                F_trload        = BOOL_MAP[self.F_pipeline.F_trload])

    @property
    def name(self) -> str:
@@ -399,6 +516,7 @@ class FmhaFwdKernel:
                bk0max=self.F_tile.F_bk0max,
                vlayout=self.F_pipeline.F_vlayout,
                mask=self.F_pipeline.F_mask,
+                logits=self.F_pipeline.F_logits,
                bias=self.F_pipeline.F_bias,
                lse=self.F_pipeline.F_lse,
                dropout=self.F_pipeline.F_dropout,
@@ -406,33 +524,45 @@ class FmhaFwdKernel:
                spad=self.F_pipeline.F_spad,
                skpad=self.F_pipeline.F_skpad,
                dpad=self.F_pipeline.F_dpad,
-                dvpad=self.F_pipeline.F_dvpad)
+                dvpad=self.F_pipeline.F_dvpad,
+                skip=self.F_pipeline.F_skip,
+                tr_load=self.F_pipeline.F_trload,
+                constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint)

-# TODO: design a more practical way to do it
-# this is current supported tile size per hdim
-def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
-    if dtype == 'fp16' or dtype == 'bf16':
-        return {
-            '32'  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-        ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '192' : FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-        }
-    elif dtype == 'fp8' or dtype == 'bf8':
-        return {
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-        }
-    else:
-        return None
+class KernelComponentFactory:
+    # TODO: design a more practical way to do it
+    # this is current supported tile size per hdim
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
+        if dtype == 'fp16' or dtype == 'bf16':
+            return {
+                (32, 32)  : [FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (64, 64)  : [FmhaFwdTileSize(16, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
+                             FmhaFwdTileSize(32, 32,  64, 64,  32,  64,   1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (96, 128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (128,128) : [FmhaFwdTileSize(16, 32, 64, 128, 32,  128,  1, 1, 1,  1, 1, 1,  16, 16, 32,  16, 16, 32,  -1),
+                             FmhaFwdTileSize(32, 32, 128, 128, 32,  128,  1, 1, 1,  1, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 64, 32, 128, 16,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                             FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                # (160,160) : [FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
+                (192,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                (192,192) : [FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,   1)],
+                (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+            }
+        elif dtype == 'fp8' or dtype == 'bf8':
+            return {
+                (64,64 )  : [FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
+                (128,128) : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
+                (256,256) : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
+            }
+        else:
+            return None

-def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    #       support this in future
-    def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]:
+    @staticmethod
+    def get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl) -> List[FmhaFwdPipeline]:
        # this function will populate a list possible pipelines
        # TODO: the order of List matters! the later in this list will be also be checked later
        # TODO: currently for qr pipeline, let 't' padding to appear later!!
@@ -440,33 +570,29 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
        squant = 't' if dtype == 'fp8' else 'f'
        pipelines = []
        if dtype in ['fp16', 'bf16']:
-            for mask, bias, lse, dropout in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"]):
-                if hdim == 256:
-                # if True:
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-                    pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-
-                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
-                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
+            for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
+                if hdim == 256 and hdim_v == 256:
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                    # the below two is used for hdim vectorize load
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
                else:
                    if bias == "bias":
                        # TODO: rocm 6.2 compiler problem if using qr_async for bias case
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
                    else:
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask))
-                        pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, dropout, squant, mask))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f'))
+                        if (hdim, hdim_v) in [(64, 64), (128, 128)] and logits == "f" and bias == "no" and dropout == "f" and lse == "f" and skip == "f":
+                            pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip, 't'))
+                            pipelines.append(FmhaFwdPipeline('qr_async_trload', 'row', 'f', 'f', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 't'))
                    if receipt == 1 and bias != "bias":
-                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim
-                        pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 'f', 't', 't', bias, lse, dropout, squant, mask)) # TODO: cover arbitraty hdim
+                        pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', logits, bias, lse, dropout, squant, mask, skip, 'f')) # TODO: cover arbitraty hdim
        elif dtype in ['fp8', 'bf8']:
            # no need lse/dropout kernels
-            for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask))
+            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
+                pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 'f', 'f', squant, mask, 'f', 'f'))
        elif dtype in ['fp8fp16', 'fp8bf16']:
            # TODO
            None
@@ -474,26 +600,45 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
            assert False
        return pipelines

+class CustomFactory(KernelComponentFactory):
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype : str) -> Optional[dict]:
+        result = KernelComponentFactory.get_hdim_tile_size_dict(dtype)
+        if dtype == 'fp16' or dtype == 'bf16':
+            if (128, 128) in result.keys():
+                result[(128, 128)].insert(0, FmhaFwdTileSize( 64, 128, 64, 128, 64,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint('get_num_blocks(128) < num_cus * min_cu_util_rate')))
+        return result
+
+def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
    gen = list()
    api_pool = FmhaFwdApiPool(mask_impl)

+    factory = CustomFactory if os.environ.get('CK_TILE_FMHA_FWD_CUSTOM_FACTORY', '0') == '1' else KernelComponentFactory
+
    for dtype in FWD_DTYPE_MAP.keys():
-        d = get_fmha_fwd_tile_dict_from_dtype(dtype)
+        d = factory.get_hdim_tile_size_dict(dtype)
        if d == None:
            continue
        #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
-        for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()):
-            tile = d[hdim_str]
-            hdim = int(hdim_str)
-            for pipeline in get_pipelines(dtype, hdim):
+        for ((hdim, hdim_v), tiles), mode in itertools.product(d.items(), MODE_MAP.keys()):
+            for tile, pipeline in itertools.product(tiles, factory.get_pipelines(dtype, hdim, hdim_v, receipt, mask_impl)):
                if mode == "group":
                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                        continue
-                if hdim == 192 and tile.F_bn1 == 128:
+                if (hdim, hdim_v) == (192, 128):
                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
-                    if pipeline.F_bias != 'no' or pipeline.F_lse == 't' or pipeline.F_dropout == 't':
+                    if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                        continue
+                if pipeline.tag != 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 != 128) or ((hdim, hdim_v) != (128, 128) and tile.F_bm0 != 128)):
+                    # non qr_async_trload only support km0=128 tile size when hdim is not 128
+                    # non qr_async only support kn0=128 tile size when hdim is 128
+                    continue
+                if pipeline.tag == 'qr_async_trload' and (((hdim, hdim_v) == (128, 128) and tile.F_bn0 == 128) or ((hdim, hdim_v) not in [(64, 64), (128, 128)])):
+                    continue
+                # logits_soft_cap is only allowed if no bias
+                if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
+                    continue
                k = FmhaFwdKernel(F_idx=0,
                                  F_hdim=hdim,
                                  F_dtype=dtype,
@@ -504,12 +649,16 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # 2 - Flash attention integration
                if receipt in (2, 3):
                    cond = dtype in ['fp16', 'bf16']
                    cond &= pipeline.F_vlayout == 'row'
                    cond &= pipeline.F_bias in ['no', 'alibi']
                    cond &= pipeline.F_squant == 'f'
+                    cond &= pipeline.F_skip == 'f'
                    if not cond:
                        continue
                # PyTorch integration
@@ -518,6 +667,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                    cond &= pipeline.F_vlayout == 'row'
                    cond &= pipeline.F_bias in ['no', 'bias']
                    cond &= pipeline.F_squant == 'f'
+                    cond &= mode == 'batch'
+                    cond &= pipeline.F_skip == 'f'
+                    cond &= pipeline.F_logits == 'f'
                    if not cond:
                        continue
                # Aiter(mha_fwd) integration
@@ -536,26 +688,34 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
+                # aiter::mha_fwd C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+
                api_pool.register_traits(k.api_trait())
                gen.append(k)

    return (api_pool, gen)

 def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
-    (autogen_dir / kernel.filename).write_text(kernel.template)
+    update_file(autogen_dir / kernel.filename, kernel.template)

 def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
-    (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api)
+    update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api)

-def write_blobs(output_dir : Path, kernel_filter : str, receipt, mask_impl) -> None:
-    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
+def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
    for kernel in kernels:
        write_single_fwd_kernel(kernel, output_dir)
    write_fwd_api(api_pool, output_dir)

-def list_blobs(file_path : Path, kernel_filter : str, receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
    with file_path.open('a') as f:
-        _, kernels = get_fwd_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -273,7 +273,7 @@ def get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    else:
        return None

-def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
+def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, optdim_list) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    #       support this in future
    def get_pipelines(dtype, hdim) -> List[FmhaFwdAppendKVPipeline]:
@@ -326,12 +326,21 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # 2 - Flash attention integration
                if receipt == 2:
                    cond = dtype in ['fp16', 'bf16']
                    cond &= pipeline.F_vlayout == 'row'
                    if not cond:
                        continue
+                # PyTorch integration
+                elif receipt == 4:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    if not cond:
+                        continue
                api_pool.register_traits(k.api_trait())
                gen.append(k)

@@ -343,15 +352,15 @@ def write_single_kernel(kernel: FmhaFwdAppendKVKernel, autogen_dir: Path) -> Non
 def write_fwd_appendkv_api(api_pool : FmhaFwdAppendKVApiPool, autogen_dir: Path) -> None:
    (autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME).write_text(api_pool.api)

-def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
-    api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
+def write_blobs(output_dir : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
    write_fwd_appendkv_api(api_pool, output_dir)

-def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> None:
    with file_path.open('a') as f:
-        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl)
+        _, kernels = get_fwd_appendkv_blobs(kernel_filter, receipt, mask_impl, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_APPENDKV_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -34,17 +34,18 @@ K0_MAX_SUBMAX_MAP = {
    64 : 64,
    96 : 128,
    128: 128,
+    # 160: 160,
    256: 256
 }

 FMHA_FWD_SPLITKV_PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVS",
    "qr_nwarp_sshuffle" : "ck_tile::BlockFmhaFwdSplitKVPipelineNWarpSShuffleQRKSVS",
-    "qr_async" : "ck_tile::BlockFmhaFwdSplitKVPipelineQRKSVSAsync",
 }

 FMHA_FWD_SPLITKV_KERNEL_BODY="""
 using fmha_dtype_{F_idx} = {F_dtype};
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
 using fmha_mask_{F_idx} = {F_mask};

 namespace {{
@@ -63,6 +64,7 @@ using fmha_trait = ck_tile::TileFmhaFwdSplitKVTraits<{F_spad},
                                                     {F_skpad},
                                                     {F_dpad},
                                                     {F_dvpad},
+                                                     {F_logits},
                                                     {F_bias},
                                                     /*kHasBiasGrad=*/false,
                                                     {F_lse},
@@ -85,16 +87,19 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaFwdSplitKVPipelineProblem<
    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
    fmha_shape,
    {F_mode},
+    fmha_variant_{F_idx},
    fmha_mask_{F_idx},
    fmha_trait>;

 using fmha_pipeline = {F_pipeline}<
    fmha_pipeline_problem>;

+/// FIXME: use {F_spad}/{F_dvpad} as kPadM/kPadN parameters after solving
+///        store_tile_raw() data corruption issue
 using fmha_epilogue =
    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
                                           typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
-                                           {F_spad}, {F_dvpad}>>;
+                                           false, false>>;

 using fmha_kernel =
    ck_tile::FmhaFwdSplitKVKernel<fmha_pipeline, fmha_epilogue>;
@@ -111,7 +116,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a)
 }}

 using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
-                        {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad},
                        {F_dvpad}>;

 #include <iostream>
@@ -265,9 +270,9 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 }}
 """

-FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
+FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                        ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
-                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
+                using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;

                // get combine kernel tile sizes
                using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType;
@@ -308,6 +313,7 @@ class FmhaFwdSplitKVApiTrait:
    bk0max    : int
    vlayout   : str
    mask      : str
+    logits    : str
    bias      : str  #
    lse       : str  #
    squant    : str  #
@@ -320,7 +326,7 @@ class FmhaFwdSplitKVApiTrait:
    @property
    def name(self) -> str:
        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
-                    f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\
+                    f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-'+\
                    f'{self.dvpad}-{self.pagedkv}'

    @property
@@ -378,6 +384,7 @@ class FmhaFwdSplitKVPipeline:
    F_skpad     : str  #
    F_dpad      : str  #
    F_dvpad     : str  #
+    F_logits    : str  # t/f
    F_bias      : str  # true/false
    F_lse       : str  #
    F_squant    : str  #
@@ -399,6 +406,9 @@ class FmhaFwdSplitKVPipeline:
        if pn != '' : n += f'_{pn}'
        else: n += '_npad'

+        if self.F_logits == 't' : n += '_logits'
+        else: n += '_nlogits'
+
        if self.F_bias != 'no' : n += f'_{self.F_bias}'
        else: n += '_nbias'

@@ -440,10 +450,10 @@ class FmhaFwdSplitKVCombinePipeline:
        n = f'{self.tag}'
        if pn != '' : n += f'_{pn}'
        else: n += '_npad'
-        
+
        if self.F_lse == 't' : n += '_lse'
        else: n += '_nlse'
-        
+
        if self.F_squant == 't' : n += '_squant'
        else: n += '_nsquant'
        return n
@@ -473,7 +483,7 @@ class FmhaFwdSplitKVApiPool:
                for k, trait in enumerate(traits):
                    if_k = 'if' if k == 0 else 'else if'
                    inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
-                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
                                   F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
                                   F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv],
                                   F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
@@ -539,6 +549,7 @@ class FmhaFwdSplitKVKernel:
                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
                F_dpad          = BOOL_MAP[self.F_pipeline.F_dpad],
                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
+                F_logits        = BOOL_MAP[self.F_pipeline.F_logits],
                F_bias          = BIAS_MAP[self.F_pipeline.F_bias],
                F_lse           = BOOL_MAP[self.F_pipeline.F_lse],
                F_squant        = BOOL_MAP[self.F_pipeline.F_squant],
@@ -572,6 +583,7 @@ class FmhaFwdSplitKVKernel:
                bk1=self.F_tile.F_bk1,
                bk0max=self.F_tile.F_bk0max,
                vlayout=self.F_pipeline.F_vlayout,
+                logits=self.F_pipeline.F_logits,
                mask=self.F_pipeline.F_mask,
                bias=self.F_pipeline.F_bias,
                lse=self.F_pipeline.F_lse,
@@ -624,8 +636,9 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
        return {
            '32'  : FmhaFwdTileSize(32, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
-        ### '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            # '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -642,8 +655,9 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
        return {
            '32'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
-        ### '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+            # '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
    }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -655,7 +669,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
    else:
        return None

-def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[FmhaFwdSplitKVApiPool, List[FmhaFwdSplitKVKernel]]:
+def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl, optdim_list) -> Tuple[FmhaFwdSplitKVApiPool, List[FmhaFwdSplitKVKernel]]:
    Pipeline = FmhaFwdSplitKVPipeline
    Kernel = FmhaFwdSplitKVKernel

@@ -669,26 +683,21 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
        squant = 't' if dtype == 'fp8' else 'f'
        pipelines = []
        if dtype in ['fp16', 'bf16']:
-            for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
-                # TODO: use async pipeline when compiler is more stable
-                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
-                # if True:
-                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask))
+            for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
+                pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))

-                    pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                else:
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask))
-                    pipelines.append(Pipeline('qr_async', 'col', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask))
-                    if receipt == 1:
-                        pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
-                        pipelines.append(Pipeline('qr', 'col', 't', 'f', 't', 't', bias, 't', squant, pagedkv, mask)) # TODO: cover arbitraty hdim
+                pipelines.append(Pipeline('qr', 'row', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 'f', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
+
+                pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
+                pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', logits, bias, 't', squant, pagedkv, mask))
        elif dtype in ['fp8', 'bf8']:
-            for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
-                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask))
+            for logits, mask, bias in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()):
+                pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, 't', squant, 'f', mask))
        elif dtype in ['fp8fp16', 'fp8bf16']:
            # TODO
            None
@@ -712,6 +721,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                        continue
+                # logits_soft_cap is only allowed if no bias
+                if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
+                    continue
                k = Kernel(F_idx=0,
                           F_hdim=hdim,
                           F_dtype=dtype,
@@ -722,6 +734,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # Flash attention integration
                if receipt == 2:
                    cond = dtype in ['fp16', 'bf16']
@@ -730,6 +745,15 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
+                # PyTorch integration
+                elif receipt == 4:
+                    cond = dtype in ['fp16, bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'bias']
+                    cond &= pipeline.F_squant == 'f'
+                    cond &= mode == 'batch'
+                    if not cond:
+                        continue
                # Aiter(mha_varlen_fwd) integration
                elif receipt == 200:
                    cond = dtype in ['fp16', 'bf16']
@@ -738,12 +762,19 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                    cond &= pipeline.F_squant == 'f'
                    if not cond:
                        continue
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
                api_pool.register_traits(k.api_trait())
                gen.append(k)

    return (api_pool, gen)

-def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> List[FmhaFwdSplitKVCombineKernel]:
+def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt, optdim_list) -> List[FmhaFwdSplitKVCombineKernel]:
    Pipeline = FmhaFwdSplitKVCombinePipeline
    Kernel = FmhaFwdSplitKVCombineKernel

@@ -790,12 +821,20 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis
                if kernel_filter != '':
                    if not fnmatch.fnmatch(k.name, kernel_filter):
                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
                # Aiter(mha_varlen_fwd) integration
                if receipt == 200:
                    cond = dtype in ['fp16', 'bf16']
                    cond &= mode == "group"
                    if not cond:
                        continue
+                # aiter::mha_fwd_splikv C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    if not cond:
+                        continue
                gen.append(k)

    return gen
@@ -807,27 +846,27 @@ def write_fwd_splitkv_api(api_pool : FmhaFwdSplitKVApiPool, autogen_dir: Path) -
    file_path = autogen_dir / FMHA_FWD_SPLITKV_API_FILENAME
    file_path.write_text(api_pool.api)

-def write_blobs(output_dir : Path, filter_list : str, receipt, mask_impl) -> None:
+def write_blobs(output_dir : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
    filter_list = filter_list.split('@')
    filter_list.extend([''] * (2 - len(filter_list)))

-    kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
+    kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
-    api_pool, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
+    api_pool, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl, optdim_list)
    for kernel in kernels:
        write_single_kernel(kernel, output_dir)
    write_fwd_splitkv_api(api_pool, output_dir)

-def list_blobs(file_path : Path, filter_list : str, receipt, mask_impl) -> None:
+def list_blobs(file_path : Path, filter_list : str, receipt, optdim_list, mask_impl) -> None:
    filter_list = filter_list.split('@')
    filter_list.extend([''] * (2 - len(filter_list)))

    with file_path.open('a') as f:
-        kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt)
+        kernels = get_fwd_splitkv_combine_blobs(filter_list[0], receipt, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
-        _, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl)
+        _, kernels = get_fwd_splitkv_blobs(filter_list[1], receipt, mask_impl, optdim_list)
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_SPLITKV_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
@@ -0,0 +1,585 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import copy
+from dataclasses import dataclass
+import fnmatch
+import itertools
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codegen.cmake_config import *
+from codegen.cpp_symbol_map import *
+
+
+DTYPE_BITS = {
+    "fp32": 32,
+    "fp16": 16,
+    "bf16": 16,
+    "fp8" : 8,
+    "bf8" : 8
+}
+
+K0_MAX_SUBMAX_MAP = {
+    32 : 32,
+    64 : 64,
+    96 : 128,
+    128: 128,
+    256: 256
+}
+
+FMHA_FWD_PAGEDKV_PIPELINE_MAP = {
+    "qr_pagedkv" : "ck_tile::BlockFmhaFwdPagedKVPipelineQRKSVS"
+}
+
+FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
+// auto generated by generate.py
+#include "ck_tile/ops/fmha/block/variants.hpp"
+#include "fmha_fwd.hpp"
+"""
+
+FMHA_FWD_KERNEL_BODY="""
+using fmha_dtype_{F_idx} = {F_dtype};
+
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
+
+using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
+                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
+                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
+                                      {F_vlayout}>;
+
+using fmha_trait_{F_idx} = ck_tile::TileFmhaFwdPagedKVTraits<{F_spad},
+                                                             {F_skpad},
+                                                             {F_dpad},
+                                                             {F_dvpad},
+                                                             {F_logits},
+                                                             {F_bias},
+                                                             false,
+                                                             {F_lse},      //lse
+                                                             {F_pagedkv},  //pagedkv
+                                                             {F_squant},
+                                                             {F_occupancy},
+                                                             {F_skip}>;
+
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
+
+using fmha_mask_{F_idx} = {F_mask};
+
+using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdPagedKVPipelineProblem<
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
+    fmha_shape_{F_idx},
+    {F_mode},
+    fmha_variant_{F_idx},
+    fmha_mask_{F_idx},
+    fmha_trait_{F_idx}>;
+
+using fmha_pipeline_{F_idx} = {F_pipeline}<
+    fmha_pipeline_problem_{F_idx}>;
+
+using fmha_epilogue_{F_idx} =
+    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
+                                           typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
+                                           {F_spad}, {F_dvpad}>>;
+
+using fmha_kernel_{F_idx} =
+    ck_tile::FmhaFwdPagedKVKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
+
+using trait_{F_idx} = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+
+#include <iostream>
+
+template<>
+float fmha_fwd_pagedkv_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_fwd_pagedkv_args a)
+{{
+    using k_ = fmha_kernel_{F_idx};
+    if(s.log_level_ > 0)
+        std::cout << ", " << k_::GetName() << std::flush;
+    auto [kargs, grids] = fmha_fwd_pagedkv_create_kargs_and_grids<k_>(a);
+    constexpr dim3 blocks             = k_::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+}}
+"""
+
+FMHA_FWD_API_FILENAME="fmha_fwd_pagedkv_api.cpp"
+FMHA_FWD_API="""
+float fmha_fwd_pagedkv(fmha_fwd_pagedkv_traits& t, fmha_fwd_pagedkv_args& a, const ck_tile::stream_config& s){{
+    float r = -1;
+{F_dispatch}
+    return r;
+}}
+"""
+
+FMHA_FWD_API_PER_DTYPE="""    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
+{F_hdim_case}
+    }}
+"""
+FMHA_FWD_API_PER_HDIM_CASE="""        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{
+{F_inner_dispatch}
+        }}
+"""
+
+FMHA_FWD_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.use_pagedkv == {F_pagedkv}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) &&
+                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
+                using trait_ = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}>;
+                return fmha_fwd_pagedkv_<trait_>(s, a);
+            }}
+"""
+
+@dataclass
+class FmhaFwdApiTrait:
+    pipeline_tag : str
+    # sync with fmha_fwd_traits<>, to generate fallback calls
+    hdim      : str
+    dtype     : str  # data type
+    mode      : str  # value from MODE_MAP
+    bm0       : int  # tile size along q seqlen (block size)
+    bn0       : int  # tile size along qk seqlen
+    bk0       : int  # tile size along qk gemm unroll
+    bn1       : int  # tile size along v head_dim
+    bk1       : int  # tile size along kv gemm unroll
+    bk0max    : int
+    vlayout   : str
+    logits    : str
+    mask      : str
+    bias      : str  #
+    lse       : str  #
+    pagedkv   : str
+    squant    : str  #
+    spad      : str
+    skpad     : str
+    dpad      : str
+    dvpad     : str
+    skip      : str
+
+    @property
+    def name(self) -> str:
+        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-'+\
+                    f'{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.pagedkv}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}'
+
+    @property
+    def scheck(self) -> str:
+        if self.mode == 'group': return 'true/*group mode spad always true*/'                  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == 'qr_async':
+            if self.spad == 't' : return 'true' # always support
+            else :                return 'true'
+        elif self.pipeline_tag in ['qr_pagedkv', 'qs']:
+            if self.spad == 't' : return f'true /*a.seqlen_q % {self.bm0} != 0*/'  # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_q % {self.bm0} == 0'
+        else: assert False
+
+    @property
+    def skcheck(self) -> str:
+        if self.mode == 'group': return 'true/*group mode skpad always true*/'                  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == 'qr_async':
+            if self.skpad == 't' : return f'a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0'
+            else :                 return f'a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0'
+        elif self.pipeline_tag in ['qr_pagedkv', 'qs']:
+            if self.skpad == 't' : return f'true /*a.seqlen_k % {self.bn0} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.seqlen_k % {self.bn0} == 0'
+        else: assert False
+
+    @property
+    def dcheck(self) -> str:
+        if self.pipeline_tag == 'qr_async':
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
+            else :               assert False
+        elif self.pipeline_tag in ['qr_pagedkv', 'qs']:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == 't': return f'true /*a.hdim_q % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :               return f'a.hdim_q % {bk0submax} == 0'
+        else:   assert False
+
+    @property
+    def dvcheck(self) -> str:
+        if self.pipeline_tag == 'qr_async':
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
+            else :                assert False
+        elif self.pipeline_tag in ['qr_pagedkv', 'qs']:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == 't': return f'true /*a.hdim_v % {bk0submax} != 0*/' # TODO: order of get_pipelines() matters! (ugly)
+            else :                return f'a.hdim_v % {bk0submax} == 0'
+        else:   assert False
+
+@dataclass
+class FmhaFwdPipeline:
+    tag : str
+
+    F_vlayout   : str  # row/col
+    F_spad      : str  # true/false
+    F_skpad     : str  #
+    F_dpad      : str  #
+    F_dvpad     : str  #
+    F_logits    : str  # t/f
+    F_bias      : str  # true/false
+    F_lse       : str  #
+    F_pagedkv   : str  #
+    F_squant    : str  #
+    F_mask      : str  # value from MASK_MAP
+    F_skip      : str  # true/false
+
+    @property
+    def name(self) -> str:
+        def pad_name() -> str:
+            n = ''
+            if self.F_spad == 't': n += 's'
+            if self.F_skpad == 't' : n += 'sk'
+            if self.F_dpad == 't' : n += 'd'
+            if self.F_dvpad == 't' : n += 'dv'
+            if n != '' : n = 'p' + n
+            return n
+        pn = pad_name()
+        n = f'{self.tag}_v{self.F_vlayout[0]}'
+        if pn != '' : n += f'_{pn}'
+        else: n += '_npad'
+
+        if self.F_logits == 't' : n += '_logits'
+        else: n += '_nlogits'
+
+        if self.F_bias != 'no' : n += f'_{self.F_bias}'
+        else: n += '_nbias'
+
+        if self.F_mask[0:2] == 's_':
+            if self.F_mask == 's_mask': n += f'_mask'
+            else: n += '_nmask'
+        else:
+            if self.F_mask != 'no' : n += f'_m{self.F_mask[0]}'
+            else: n += '_nmask'
+
+        if self.F_lse == 't' : n += '_lse'
+        else: n += '_nlse'
+
+        if self.F_skip == 't' : n += '_skip'
+        else: n += '_nskip'
+
+        if self.F_squant == 't' : n += '_squant'
+        else: n += '_nsquant'
+
+        if self.F_pagedkv == 't' : n += '_pagedkv'
+        else: n += '_npagedkv'
+
+        return n
+
+class FmhaFwdApiPool:
+    def __init__(self, mask_impl):
+        self.pool = dict()
+        self.mask_impl = mask_impl
+
+    def register_traits(self, trait : FmhaFwdApiTrait) -> None:
+        # TODO: do we need to check duplication?
+        if trait.dtype not in self.pool.keys():
+            self.pool[trait.dtype] = dict()
+        if trait.hdim not in self.pool[trait.dtype].keys():
+            self.pool[trait.dtype][trait.hdim] = list()
+
+        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+
+    @property
+    def api(self) -> str:
+        per_dtypes=str()
+        for i, dtype in enumerate(self.pool.keys()):
+            per_hdim_case=str()
+            for j, hdim in enumerate(self.pool[dtype].keys()):
+                traits=self.pool[dtype][hdim]
+                inners=str()
+                for k, trait in enumerate(traits):
+                    if_k = 'if' if k == 0 else 'else if'
+                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout],
+                                   F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_logits=BOOL_MAP[trait.logits], F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                                   F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias],
+                                   F_lse=BOOL_MAP[trait.lse], F_pagedkv=BOOL_MAP[trait.pagedkv], F_skip=BOOL_MAP[trait.skip],
+                                   F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
+                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
+                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
+                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
+                if_j = 'if' if j == 0 else 'else if'
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners)
+            if_i = 'if' if i == 0 else 'else if'
+            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
+        if not per_dtypes:
+            # empty string we add some ignore to suppress warning in api
+            per_dtypes += '    (void)t ; (void)s ; (void)a;'
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes)
+
+@dataclass
+class FmhaFwdTileSize:
+    F_bm0       : int  # tile size along q seqlen (block size)
+    F_bn0       : int  # tile size along k seqlen
+    F_bk0       : int  # tile size along qk gemm unroll
+    F_bn1       : int  # tile size along v head_dim
+    F_bk1       : int  # tile size along kv gemm unroll
+    F_bk0max    : int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_rm0       : int  # number of warps for gemm0 along q seqlen
+    F_rn0       : int  # number of warps for gemm0 along k seqlen
+    F_rk0       : int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1       : int  # number of warps for gemm1 along q seqlen
+    F_rn1       : int  # number of warps for gemm1 along head dim v
+    F_rk1       : int  # number of warps for gemm1 along k seqlen (not used)
+    F_wm0       : int  # gemm0 warp size along m
+    F_wn0       : int  # gemm0 warp size along n
+    F_wk0       : int  # gemm0 warp size along k
+    F_wm1       : int  # gemm1 warp size along m
+    F_wn1       : int  # gemm1 warp size along n
+    F_wk1       : int  # gemm1 warp size along k
+    F_occupancy : int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+    @property
+    def name(self) -> str:
+        return f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}" +\
+        f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}" +\
+        f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}" +\
+        ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+
+@dataclass
+class FmhaFwdKernel:
+    F_idx           : int  # this is not a tunable, but a counter to differentiate symbol
+    F_hdim          : int  # hdim
+    F_dtype         : str  # data type
+    F_mode          : str  # value from MODE_MAP
+    F_tile          : FmhaFwdTileSize
+    F_pipeline      : FmhaFwdPipeline
+    mask_impl       : str
+
+    @property
+    def template(self) -> str:
+        kernel_body = str()
+        return FMHA_FWD_KERNEL_HEADER + \
+            FMHA_FWD_KERNEL_BODY.format(
+                F_idx           = self.F_idx,
+                F_hdim          = self.F_hdim,
+                F_dtype         = FWD_DTYPE_MAP[self.F_dtype],
+                F_bm0           = self.F_tile.F_bm0,
+                F_bn0           = self.F_tile.F_bn0,
+                F_bk0           = self.F_tile.F_bk0,
+                F_bn1           = self.F_tile.F_bn1,
+                F_bk1           = self.F_tile.F_bk1,
+                F_bk0max        = self.F_tile.F_bk0max,
+                F_rm0           = self.F_tile.F_rm0,
+                F_rn0           = self.F_tile.F_rn0,
+                F_rk0           = self.F_tile.F_rk0,
+                F_rm1           = self.F_tile.F_rm1,
+                F_rn1           = self.F_tile.F_rn1,
+                F_rk1           = self.F_tile.F_rk1,
+                F_wm0           = self.F_tile.F_wm0,
+                F_wn0           = self.F_tile.F_wn0,
+                F_wk0           = self.F_tile.F_wk0,
+                F_wm1           = self.F_tile.F_wm1,
+                F_wn1           = self.F_tile.F_wn1,
+                F_wk1           = self.F_tile.F_wk1,
+                F_vlayout       = LAYOUT_MAP[self.F_pipeline.F_vlayout],
+                F_spad          = BOOL_MAP[self.F_pipeline.F_spad],
+                F_skpad         = BOOL_MAP[self.F_pipeline.F_skpad],
+                F_dpad          = BOOL_MAP[self.F_pipeline.F_dpad],
+                F_dvpad         = BOOL_MAP[self.F_pipeline.F_dvpad],
+                F_logits        = BOOL_MAP[self.F_pipeline.F_logits],
+                F_bias          = BIAS_MAP[self.F_pipeline.F_bias],
+                F_lse           = BOOL_MAP[self.F_pipeline.F_lse],
+                F_pagedkv       = BOOL_MAP[self.F_pipeline.F_pagedkv],
+                F_squant        = BOOL_MAP[self.F_pipeline.F_squant],
+                F_skip          = BOOL_MAP[self.F_pipeline.F_skip],
+                F_occupancy     = self.F_tile.F_occupancy,
+                F_pipeline_enum = PIPELINE_ENUM_MAP[self.F_pipeline.tag],
+                F_mask          = get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
+                F_mode          = MODE_MAP[self.F_mode],
+                F_pipeline      = FMHA_FWD_PAGEDKV_PIPELINE_MAP[self.F_pipeline.tag])
+
+    @property
+    def name(self) -> str:
+        # TODO: we don't encode idx here
+        return f"fmha_fwd_pagedkv_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" + \
+                self.F_tile.name + '_' + self.F_pipeline.name
+
+    @property
+    def filename(self) -> str:
+        return self.name + ".cpp"
+
+    def api_trait(self) -> FmhaFwdApiTrait:
+        return FmhaFwdApiTrait(
+                pipeline_tag=self.F_pipeline.tag,
+                hdim=str(self.F_hdim),
+                dtype=self.F_dtype,
+                mode=self.F_mode,
+                bm0=self.F_tile.F_bm0,
+                bn0=self.F_tile.F_bn0,
+                bk0=self.F_tile.F_bk0,
+                bn1=self.F_tile.F_bn1,
+                bk1=self.F_tile.F_bk1,
+                bk0max=self.F_tile.F_bk0max,
+                vlayout=self.F_pipeline.F_vlayout,
+                mask=self.F_pipeline.F_mask,
+                logits=self.F_pipeline.F_logits,
+                bias=self.F_pipeline.F_bias,
+                lse=self.F_pipeline.F_lse,
+                pagedkv=self.F_pipeline.F_pagedkv,
+                squant=self.F_pipeline.F_squant,
+                spad=self.F_pipeline.F_spad,
+                skpad=self.F_pipeline.F_skpad,
+                dpad=self.F_pipeline.F_dpad,
+                dvpad=self.F_pipeline.F_dvpad,
+                skip=self.F_pipeline.F_skip)
+
+# TODO: design a more practical way to do it
+# this is current supported tile size per hdim
+def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
+    if dtype == 'fp16' or dtype == 'bf16':
+        return {
+            # '32'  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            # '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            # '192' : FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            # '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        }
+    elif dtype == 'fp8' or dtype == 'bf8':
+        return {
+            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+        }
+    else:
+        return None
+
+def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
+    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
+    #       support this in future
+    def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]:
+        # this function will populate a list possible pipelines
+        # TODO: the order of List matters! the later in this list will be also be checked later
+        # TODO: currently for qr_pagedkv pipeline, let 't' padding to appear later!!
+        # TODO: how to design this more generic?
+        squant = 't' if dtype == 'fp8' else 'f'
+        pipelines = []
+        if dtype in ['fp16', 'bf16']:
+            for logits, mask, bias,  pagedkv, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(),  ["t", "f"], ["t", "f"]):
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'col', 't', 'f', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'col', 't', 't', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 't', 'f', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
+                pipelines.append(FmhaFwdPipeline('qr_pagedkv', 'row', 't', 't', 'f', 'f', logits, bias, 'f',  pagedkv, squant, mask, skip))
+        elif dtype in ['fp8', 'bf8']:
+            # TODO
+            None
+        elif dtype in ['fp8fp16', 'fp8bf16']:
+            # TODO
+            None
+        else:
+            assert False
+        return pipelines
+
+    gen = list()
+    api_pool = FmhaFwdApiPool(mask_impl)
+
+    for dtype in FWD_DTYPE_MAP.keys():
+        d = get_fmha_fwd_tile_dict_from_dtype(dtype)
+        if d == None:
+            continue
+        #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
+        for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()):
+            tile = d[hdim_str]
+            hdim = int(hdim_str)
+            for pipeline in get_pipelines(dtype, hdim):
+                # if pipeline.F_pagedkv == 'f':
+                #     continue
+                if mode == "group":
+                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
+                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
+                        continue
+                if hdim == 192 and tile.F_bn1 == 128:
+                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
+                    if pipeline.F_bias != 'no' or pipeline.F_lse == 't' :
+                        continue
+                # logits_soft_cap is only allowed if no bias
+                if not ((pipeline.F_logits == 't' and pipeline.F_bias == 'no') or pipeline.F_logits == 'f'):
+                    continue
+                k = FmhaFwdKernel(F_idx=0,
+                                  F_hdim=hdim,
+                                  F_dtype=dtype,
+                                  F_mode=mode,
+                                  F_tile=tile,
+                                  F_pipeline=pipeline,
+                                  mask_impl=mask_impl)
+                if kernel_filter != '':
+                    if not fnmatch.fnmatch(k.name, kernel_filter):
+                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
+                # 2 - Flash attention integration
+                if receipt in (2, 3):
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    cond &= pipeline.F_skip == 'f'
+                    if not cond:
+                        continue
+                # PyTorch integration
+                elif receipt == 4:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'bias']
+                    cond &= pipeline.F_squant == 'f'
+                    cond &= pipeline.F_skip == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_fwd) integration
+                elif receipt == 100:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'batch'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_varlen_fwd) integration
+                elif receipt == 200:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == 'group'
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # aiter::mha_fwd C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+
+                api_pool.register_traits(k.api_trait())
+                gen.append(k)
+
+    return (api_pool, gen)
+
+def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
+    (autogen_dir / kernel.filename).write_text(kernel.template)
+
+def write_fwd_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
+    (autogen_dir / FMHA_FWD_API_FILENAME).write_text(api_pool.api)
+
+def write_blobs(output_dir : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+    for kernel in kernels:
+        write_single_fwd_kernel(kernel, output_dir)
+    write_fwd_api(api_pool, output_dir)
+
+def list_blobs(file_path : Path, kernel_filter : str, receipt, optdim_list, mask_impl) -> None:
+    with file_path.open('a') as f:
+        _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+        for kernel in kernels:
+            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
+        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
--- a/example/ck_tile/01_fmha/codegen/utils.py
+++ b/example/ck_tile/01_fmha/codegen/utils.py
@@ -0,0 +1,21 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import os.path as path
+
+
+def update_file(file_path, content):
+    """Update the file at file_path with the given content if it differs from the existing content.
+
+    It avoids unnecessary touching of the file which triggers rebuilds
+    """
+
+    existing_content = ""
+    if path.exists(file_path):
+        with open(file_path, "r") as file:
+            existing_content = file.read()
+    if existing_content == content:
+        return
+    with open(file_path, "w") as file:
+        file.write(content)
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "fmha_bwd.hpp"
 #include "ck_tile/host.hpp"
@@ -355,7 +355,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    if(bias.type == bias_enum::alibi)
    {
        auto slopes = ck_tile::get_alibi_slopes<AccDataType>(nhead);
-        assert(slopes.size() == nhead);
+        assert(slopes.size() == static_cast<decltype(slopes.size())>(nhead));
        if(bias.rank_info == 0)
        {
            // alibi in 1*h
@@ -756,22 +756,17 @@ bool run(const ck_tile::ArgParser& arg_parser)

        if(p_drop > 0)
        {
-            p_hp_host_ref.ForEach(
-                [&](auto& self, auto idx) { p_dropped_hp_host_ref(idx) = self(idx); });
+            p_dropped_hp_host_ref = p_hp_host_ref;
            randval_host_ref.ForEach([&](auto& self, auto idx) {
                self(idx) = randval_host(b, idx[0], idx[1] + query_offset, idx[2]);
            });
            ck_tile::reference_batched_dropout(
                p_dropped_hp_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
-            p_dropped_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                p_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-            });
+            p_lp_host_ref = p_dropped_hp_host_ref.template CopyAsType<GemmDataType>();
        }
        else
        {
-            p_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                p_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-            });
+            p_lp_host_ref = p_hp_host_ref.template CopyAsType<GemmDataType>();
        }

        // O = P * V
@@ -798,6 +793,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }
    }

+    // set to bad values to check if the kernel writes to these buffers
+    ck_tile::FillConstant<QGradDataType>{ck_tile::numeric<QGradDataType>::infinity()}(dq_host);
+    ck_tile::FillConstant<KGradDataType>{ck_tile::numeric<KGradDataType>::infinity()}(dk_host);
+    ck_tile::FillConstant<VGradDataType>{ck_tile::numeric<VGradDataType>::infinity()}(dv_host);
+    dq_buf.ToDevice(dq_host.data());
+    dk_buf.ToDevice(dk_host.data());
+    dv_buf.ToDevice(dv_host.data());
+
    o_buf.ToDevice(o_host.data());
    lse_buf.ToDevice(lse_host.data());
    dq_buf.SetZero();
@@ -806,6 +809,20 @@ bool run(const ck_tile::ArgParser& arg_parser)

    ck_tile::stream_config stream_config_v{
        nullptr, true, 0, 0, 1, arg_parser.get_str("timer") == std::string("gpu")};
+
+    printf("\nfmha_bwd_traits: hdim_q=%d, hdim_v=%d, data_type=%s, is_group_mode=%d, mask_type=%d, "
+           "bias_type=%d, has_dbias=%d, has_dropout=%d, is_store_randval=%d, is_deterministic=%d\n",
+           fmha_traits.hdim_q,
+           fmha_traits.hdim_v,
+           fmha_traits.data_type.c_str(),
+           fmha_traits.is_group_mode,
+           static_cast<int>(fmha_traits.mask_type),
+           static_cast<int>(fmha_traits.bias_type),
+           fmha_traits.has_dbias,
+           fmha_traits.has_dropout,
+           fmha_traits.is_store_randval,
+           fmha_traits.is_deterministic);
+    fflush(stdout);
    fmha_bwd(fmha_traits, fmha_args, stream_config_v);

    dq_buf.FromDevice(dq_host.data());
@@ -854,29 +871,27 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }

        // dS_i_j = P_i_j .* (dP_i_j - dO_i dot O_i)
-        ds_hp_host_ref.ForEach([&](auto& self, auto idx_gmn) {
-            AccDataType do_dot_o = 0;
-            for(int o = 0; o < hdim_v; o++)
-            {
-                auto idx_gmo = idx_gmn;
-                idx_gmo[2]   = o;
-                do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(idx_gmo)) *
-                            ck_tile::type_convert<AccDataType>(o_host_refs[wb](idx_gmo));
-            }
-            self(idx_gmn) = ck_tile::type_convert<AccDataType>(
-                p_hp_host_refs[wb](idx_gmn) * (dp_hp_host_ref(idx_gmn) - do_dot_o));
-        });
+        ck_tile::make_ParallelTensorFunctor(
+            [&](auto i0, auto i1, auto i2) {
+                AccDataType do_dot_o = 0;
+                for(int o = 0; o < hdim_v; o++)
+                {
+                    do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(i0, i1, o)) *
+                                ck_tile::type_convert<AccDataType>(o_host_refs[wb](i0, i1, o));
+                }
+                ds_hp_host_ref(i0, i1, i2) = ck_tile::type_convert<AccDataType>(
+                    p_hp_host_refs[wb](i0, i1, i2) * (dp_hp_host_ref(i0, i1, i2) - do_dot_o));
+            },
+            ds_hp_host_ref.mDesc.get_lengths()[0],
+            ds_hp_host_ref.mDesc.get_lengths()[1],
+            ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency());

        if(use_dbias)
        {
-            ds_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                dbias_host_ref(idx) = ck_tile::type_convert<BiasGradDataType>(self(idx));
-            });
+            dbias_host_ref = ds_hp_host_ref.template CopyAsType<BiasGradDataType>();
        }

-        ds_hp_host_ref.ForEach([&](auto& self, auto idx) {
-            ds_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-        });
+        ds_lp_host_ref = ds_hp_host_ref.template CopyAsType<GemmDataType>();

        // dV = P_drop^T@dO^T
        // dV = P^T@dO^T w/o dropout
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck_tile/core.hpp"
+#include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/fmha.hpp"
 #include "ck_tile/ops/epilogue.hpp"
@@ -155,6 +156,12 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
 {
    assert(args.nhead_q % args.nhead_k == 0);
    auto kargs = [&] {
+        constexpr bool dq_uss_acc  = FmhaBwdDQDKDVKernel::kMaxSeqLenQ == 0;
+        const auto dq_ptr          = dq_uss_acc ? args.dq_acc_ptr : args.dq_ptr;
+        const auto stride_dq       = dq_uss_acc ? args.stride_dq_acc : args.stride_dq;
+        const auto nhead_stride_dq = dq_uss_acc ? args.nhead_stride_dq_acc : args.nhead_stride_dq;
+        const auto batch_stride_dq = dq_uss_acc ? args.batch_stride_dq_acc : args.batch_stride_dq;
+
        // create group mode kernel arguments
        if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode)
        {
@@ -169,7 +176,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.dk_ptr,
                                                      args.dv_ptr,
                                                      args.dbias_ptr,
-                                                      args.dq_acc_ptr,
+                                                      dq_ptr,
                                                      args.seqstart_q_ptr,
                                                      args.seqstart_k_ptr,
                                                      args.seqlen_k_ptr,
@@ -184,7 +191,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.stride_bias,
                                                      args.stride_randval,
                                                      args.stride_do,
-                                                      args.stride_dq_acc,
+                                                      stride_dq,
                                                      args.stride_dk,
                                                      args.stride_dv,
                                                      args.stride_dbias,
@@ -195,7 +202,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.nhead_stride_randval,
                                                      args.nhead_stride_do,
                                                      args.nhead_stride_lsed,
-                                                      args.nhead_stride_dq_acc,
+                                                      nhead_stride_dq,
                                                      args.nhead_stride_dk,
                                                      args.nhead_stride_dv,
                                                      args.nhead_stride_dbias,
@@ -219,7 +226,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.dk_ptr,
                                                      args.dv_ptr,
                                                      args.dbias_ptr,
-                                                      args.dq_acc_ptr,
+                                                      dq_ptr,
                                                      args.seqlen_q,
                                                      args.seqlen_k,
                                                      args.hdim_q,
@@ -233,7 +240,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.stride_bias,
                                                      args.stride_randval,
                                                      args.stride_do,
-                                                      args.stride_dq_acc,
+                                                      stride_dq,
                                                      args.stride_dk,
                                                      args.stride_dv,
                                                      args.stride_dbias,
@@ -244,7 +251,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.nhead_stride_randval,
                                                      args.nhead_stride_do,
                                                      args.nhead_stride_lsed,
-                                                      args.nhead_stride_dq_acc,
+                                                      nhead_stride_dq,
                                                      args.nhead_stride_dk,
                                                      args.nhead_stride_dv,
                                                      args.nhead_stride_dbias,
@@ -255,7 +262,7 @@ auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
                                                      args.batch_stride_randval,
                                                      args.batch_stride_do,
                                                      args.batch_stride_lsed,
-                                                      args.batch_stride_dq_acc,
+                                                      batch_stride_dq,
                                                      args.batch_stride_dk,
                                                      args.batch_stride_dv,
                                                      args.batch_stride_dbias,
@@ -357,31 +364,17 @@ auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
 template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
-          ck_tile::BlockFmhaBwdPipelineEnum FmhaBwdPipelineEnum_,
          typename FmhaMask_,
          typename FmhaDropout_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kHasBiasGrad_,
-          bool kPadS_,
-          bool kPadSK_,
          bool kPadD_,
          bool kPadDv_,
-          bool kIsDeterministic_>
+          bool kIsDeterministic_,
+          bool kUseTrLoad_,
+          ck_tile::index_t MaxSeqLenQ_>
 struct fmha_bwd_dq_dk_dv_traits_
 {
-    static constexpr ck_tile::index_t HDim    = HDim_;
-    using DataType                            = ck_tile::remove_cvref_t<DataType_>;
-    static constexpr bool kIsGroupMode        = kIsGroupMode_;
-    static constexpr auto FmhaBwdPipelineEnum = FmhaBwdPipelineEnum_;
-    using FmhaMask                            = ck_tile::remove_cvref_t<FmhaMask_>;
-    using FmhaDropout                         = ck_tile::remove_cvref_t<FmhaDropout_>;
-    static constexpr auto BiasEnum            = BiasEnum_;
-    static constexpr bool kHasBiasGrad        = kHasBiasGrad_;
-    static constexpr bool kPadS               = kPadS_;
-    static constexpr bool kPadSK              = kPadSK_;
-    static constexpr bool kPadD               = kPadD_;
-    static constexpr bool kPadDv              = kPadDv_;
-    static constexpr bool kIsDeterministic    = kIsDeterministic_;
 };

 template <typename Traits_>
@@ -392,6 +385,8 @@ void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);

 template <typename Traits_>
 std::string fmha_bwd_dq_dk_dv_get_name_();
+template <typename Traits_>
+int fmha_bwd_dq_dk_dv_maxq_();

 template <ck_tile::index_t HDim_, typename DataType_, bool kIsGroupMode_, bool kPadS_, bool kPadDv_>
 struct fmha_bwd_dot_do_o_traits_
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "fmha_fwd.hpp"
 #include "ck_tile/host.hpp"
@@ -11,6 +11,7 @@
 #include <array>
 #include <cstring>
 #include <functional>
+#include <cmath>
 #include <numeric>
 #include <ostream>
 #include <string>
@@ -72,6 +73,7 @@ auto create_args(int argc, char* argv[])
                "0",
                "scale factor of S. 0 means equal to 1/sqrt(hdim).\n"
                "note when squant=1, this value will be modified by range_q/k")
+        .insert("logits_soft_cap", "0", "attention logits soft capping value.")
        .insert("range_q", "16", "per-tensor quantization range of q. used if squant=1.")
        .insert("range_k", "16", "per-tensor quantization range of k. used if squant=1.")
        .insert("range_v", "16", "per-tensor quantization range of v. used if squant=1.")
@@ -176,50 +178,30 @@ auto get_elimit<FmhaFwdFp8>(std::string init_method)
    }
 }

-int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int num_n_blocks, int max_splits)
+int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int max_splits)
 {
    // If we have enough to almost fill the SMs, then just use 1 split
    if(batch_nhead_mblocks >= 0.8f * num_SMs)
    {
        return 1;
    }
-    max_splits           = std::min({max_splits, num_SMs, num_n_blocks});
+    max_splits           = std::min({max_splits, num_SMs});
    float max_efficiency = 0.f;
    std::vector<float> efficiency;
    efficiency.reserve(max_splits);
-    auto ceildiv = [](int a, int b) { return (a + b - 1) / b; };
-    // Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits,
-    // we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks
-    // (i.e. it's 11 splits anyway).
-    // So we check if the number of blocks per split is the same as the previous num_splits.
-    auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) {
-        return num_splits == 1 ||
-               ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1);
-    };
    for(int num_splits = 1; num_splits <= max_splits; num_splits++)
    {
-        if(!is_split_eligible(num_splits))
+        float n_waves = float(batch_nhead_mblocks * num_splits) / num_SMs;
+        float eff     = n_waves / ceil(n_waves);
+        // printf("num_splits = %d, eff = %f\n", num_splits, eff);
+        if(eff > max_efficiency)
        {
-            efficiency.push_back(0.f);
-        }
-        else
-        {
-            float n_waves = float(batch_nhead_mblocks * num_splits) / num_SMs;
-            float eff     = n_waves / ceil(n_waves);
-            // printf("num_splits = %d, eff = %f\n", num_splits, eff);
-            if(eff > max_efficiency)
-            {
-                max_efficiency = eff;
-            }
-            efficiency.push_back(eff);
+            max_efficiency = eff;
        }
+        efficiency.push_back(eff);
    }
    for(int num_splits = 1; num_splits <= max_splits; num_splits++)
    {
-        if(!is_split_eligible(num_splits))
-        {
-            continue;
-        }
        if(efficiency[num_splits - 1] >= 0.85 * max_efficiency)
        {
            // printf("num_splits chosen = %d\n", num_splits);
@@ -232,6 +214,7 @@ int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int num_n_blocks,
 int override_num_splits_if_necessary(
    int batch, int nhead, int max_seqlen_q, int hdim_v, float p_drop, int num_splits)
 {
+    (void)hdim_v;
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
@@ -248,15 +231,13 @@ int override_num_splits_if_necessary(

    // tile size should match the generate.py
    const int kM0 = 64;
-    const int kN1 = hdim_v;

    const int num_m_blocks = ck_tile::integer_divide_ceil(max_seqlen_q, kM0);
-    const int num_n_blocks = ck_tile::integer_divide_ceil(hdim_v, kN1);

    if(num_splits < 1 && p_drop == 0.0f)
    {
        return num_splits_heuristic(
-            batch * nhead * num_m_blocks, props.multiProcessorCount * 2, num_n_blocks, 128);
+            batch * nhead * num_m_blocks, props.multiProcessorCount * 2, 128);
    }

    return num_splits;
@@ -342,7 +323,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }

    ck_tile::index_t page_block_size = arg_parser.get_int("page_block_size");
-#if !CK_TILE_FMHA_FWD_APPENDKV_API && !CK_TILE_FMHA_FWD_SPLITKV_API
+#if(!(CK_TILE_FMHA_FWD_APPENDKV_API || CK_TILE_FMHA_FWD_SPLITKV_API || \
+      CK_TILE_FMHA_FWD_PAGEDKV_API))
    if(0 < page_block_size)
    {
        std::cerr << "paged-kvcache is not supported. ignoring the 'page_block_size' option"
@@ -358,7 +340,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    }

    bool use_cache_batch_idx = arg_parser.get_bool("cache_batch_idx");
-#if !CK_TILE_FMHA_FWD_APPENDKV_API && !CK_TILE_FMHA_FWD_SPLITKV_API
+#if !(CK_TILE_FMHA_FWD_APPENDKV_API || CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API)
    if(use_cache_batch_idx)
    {
        std::cerr << "split-kv is not supported. ignoring the 'cache_batch_idx' option"
@@ -416,6 +398,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    if(scale_s == .0f)
        scale_s = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?

+    const float logits_soft_cap = arg_parser.get_float("logits_soft_cap");
+
    std::string squant_str = arg_parser.get_str("squant");
    bool squant            = [&]() {
        if(squant_str == "auto")
@@ -538,13 +522,13 @@ bool run(const ck_tile::ArgParser& arg_parser)
                max_seqlen_k = real_seqlen_k;
            }

-            flop += nhead * (static_cast<std::size_t>(2) * real_seqlen_q * real_seqlen_k * hdim_q +
-                             static_cast<std::size_t>(2) * real_seqlen_q * hdim_v * real_seqlen_k);
+            flop += nhead * (static_cast<std::size_t>(2) * mask.get_unmaskarea() * hdim_q +
+                             static_cast<std::size_t>(2) * mask.get_unmaskarea() * hdim_v);

            num_byte += nhead * (sizeof(QDataType) * real_seqlen_q * hdim_q +
-                                 sizeof(KDataType) * real_seqlen_k * hdim_q +
-                                 sizeof(VDataType) * hdim_v * real_seqlen_k +
                                 sizeof(ODataType) * real_seqlen_q * hdim_v);
+            num_byte += nhead_k * (sizeof(KDataType) * real_seqlen_k * hdim_q +
+                                   sizeof(VDataType) * hdim_v * real_seqlen_k);
        }
    }

@@ -564,7 +548,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        std::cerr << "num_splits greater than 128 is not supported" << std::endl;
        return false;
    }
-#if CK_TILE_FMHA_FWD_SPLITKV_API
+#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
    if(0 < p_drop && (1 < num_splits || use_kvcache))
    {
        std::cerr << "dropout is not supoprted by split-kv kernels. ignoring the 'p_drop' option"
@@ -620,7 +604,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
    ck_tile::HostTensor<BiasDataType> bias_host(
        bias.type == bias_enum::elementwise_bias
-            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, shape_seqlen_k)
+            ? get_lengths(i_perm, 1, 1, shape_seqlen_q, max_seqlen_k)
            : std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);

    ck_tile::HostTensor<SaccDataType> alibi_slope_host(
@@ -819,7 +803,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                  << (is_rotary_interleaved ? "inter" : "half") << ")";
    }
 #endif
-#if CK_TILE_FMHA_FWD_SPLITKV_API
+#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
    if(1 < num_splits)
    {
        std::cout << ", num_splits:" << num_splits;
@@ -850,6 +834,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        else // fmha_fwd_traits or fmha_splitkv_traits
        {
            traits.is_group_mode       = (mode == mode_enum::group);
+            traits.has_logits_soft_cap = 0.f < logits_soft_cap;
            traits.mask_type           = mask.type;
            traits.bias_type           = bias.type;
            traits.has_lse             = lse;
@@ -859,6 +844,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
            {
                traits.has_dropout = (p_drop > 0.0f);
            }
+            else if constexpr(std::is_same_v<fmha_fwd_pagedkv_traits,
+                                             std::decay_t<decltype(traits)>>)
+            {
+                traits.use_pagedkv = use_kvcache;
+            }
        }
    };

@@ -884,7 +874,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            else
                return i_perm ? seqlen_knew : nhead_k * seqlen_knew;
        }();
-        const ck_tile::index_t stride_bias    = (i_perm ? shape_seqlen_k : 1 * shape_seqlen_k);
+        const ck_tile::index_t stride_bias    = (i_perm ? max_seqlen_k : 1 * max_seqlen_k);
        const ck_tile::index_t stride_randval = (max_seqlen_k);
        const ck_tile::index_t stride_o_acc   = (hdim_v);
        const ck_tile::index_t stride_o       = (o_perm ? hdim_v : nhead * hdim_v);
@@ -909,7 +899,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                return i_perm ? hdim_v * seqlen_knew : seqlen_knew;
        }();
        const ck_tile::index_t nhead_stride_bias =
-            (i_perm ? 0 * shape_seqlen_q * shape_seqlen_k : 0 * shape_seqlen_k);
+            (i_perm ? 0 * shape_seqlen_q * max_seqlen_k : 0 * max_seqlen_k);
        const ck_tile::index_t nhead_stride_randval = (shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t nhead_stride_lse     = shape_seqlen_q;
        const ck_tile::index_t nhead_stride_lse_acc = (num_splits * shape_seqlen_q);
@@ -925,7 +915,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            (0 < page_block_size ? (nhead_k * hdim_v * page_block_size)
                                 : (nhead_k * hdim_v * shape_seqlen_k));
        const ck_tile::index_t batch_stride_vnew    = (nhead_k * hdim_v * seqlen_knew);
-        const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * shape_seqlen_k);
+        const ck_tile::index_t batch_stride_bias    = (0 * nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_randval = (nhead * shape_seqlen_q * max_seqlen_k);
        const ck_tile::index_t batch_stride_lse     = (nhead * shape_seqlen_q);
        const ck_tile::index_t batch_stride_lse_acc = (nhead * num_splits * shape_seqlen_q);
@@ -1007,6 +997,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
            args.scale_p = scale_p;
            args.scale_o = scale_o;

+            args.logits_soft_cap = logits_soft_cap;
+
            args.stride_bias =
                (bias.type == bias_enum::alibi ? (bias.rank_info == 0 ? 0 : nhead) : stride_bias);
            args.stride_o          = stride_o;
@@ -1065,6 +1057,17 @@ bool run(const ck_tile::ArgParser& arg_parser)
                args.split_stride_lse_acc = split_stride_lse_acc;
                args.split_stride_o_acc   = split_stride_o_acc;
            }
+            else if constexpr(std::is_same_v<fmha_fwd_pagedkv_args, std::decay_t<decltype(args)>>)
+            {
+                args.block_table_ptr =
+                    (0 < page_block_size ? block_table_buf.GetDeviceBuffer() : nullptr);
+                args.batch_stride_block_table = batch_stride_block_table;
+                args.page_block_size          = page_block_size;
+                args.is_gappy = false; // use 'false' for flash-attention integration
+
+                args.cache_batch_idx =
+                    (use_cache_batch_idx ? cache_batch_idx_buf.GetDeviceBuffer() : nullptr);
+            }
        }
    };

@@ -1086,7 +1089,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

    const float fwd_ave_time = [&] {
 #if CK_TILE_FMHA_FWD_SPLITKV_API
-        if(1 < num_splits || use_kvcache)
+        if(1 < num_splits && use_kvcache)
        {
            fmha_fwd_splitkv_traits fmha_splitkv_traits;
            init_traits(fmha_splitkv_traits);
@@ -1096,6 +1099,18 @@ bool run(const ck_tile::ArgParser& arg_parser)

            return fmha_fwd_splitkv(fmha_splitkv_traits, fmha_splitkv_args, stream_config);
        }
+#endif
+#if CK_TILE_FMHA_FWD_PAGEDKV_API
+        if(use_kvcache)
+        {
+            fmha_fwd_pagedkv_traits fmha_pagedkv_traits;
+            init_traits(fmha_pagedkv_traits);
+
+            fmha_fwd_pagedkv_args fmha_pagedkv_args;
+            init_args(fmha_pagedkv_args);
+
+            return fmha_fwd_pagedkv(fmha_pagedkv_traits, fmha_pagedkv_args, stream_config);
+        }
 #endif
        fmha_fwd_traits fmha_traits;
        init_traits(fmha_traits);
@@ -1120,7 +1135,7 @@ bool run(const ck_tile::ArgParser& arg_parser)

    std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
              << std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
-              << " GB/s" << std::flush;
+              << " GB/s" << std::flush << std::endl;

    if(do_validation == 0)
    {
@@ -1251,7 +1266,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host_ref_ro(i); });
        }
 #endif
-#if CK_TILE_FMHA_FWD_SPLITKV_API
+#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
        if(0 < page_block_size) {
            if(i_perm) {
                k_host_ref.ForEach([&](auto& self, auto i) {
@@ -1302,7 +1317,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            });
        }
 #endif
-#if CK_TILE_FMHA_FWD_SPLITKV_API
+#if CK_TILE_FMHA_FWD_SPLITKV_API || CK_TILE_FMHA_FWD_PAGEDKV_API
        if(0 < page_block_size) {
            if(is_v_rowmajor) {
                if(i_perm) {
@@ -1375,15 +1390,25 @@ bool run(const ck_tile::ArgParser& arg_parser)
            ck_tile::identity{},
            ck_tile::scales(scale_s));

+        if(0.f < logits_soft_cap)
+        {
+            ck_tile::reference_unary_elementwise<SaccDataType, SaccDataType, SaccDataType>(
+                s_host_ref, s_host_ref, [logits_soft_cap](SaccDataType logits) {
+                    return ck_tile::type_convert<SaccDataType>(
+                        logits_soft_cap *
+                        std::tanhf(ck_tile::type_convert<float>(logits / logits_soft_cap)));
+                });
+        }
+
        if(bias.type == bias_enum::elementwise_bias)
        {
            // elementwise bias
            ck_tile::HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
            // clang-format off
            if(i_perm)
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2] + key_offset); });
+                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2]); });
            else
-                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2] + key_offset); });
+                bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2]); });
            // clang-format on

            // broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -4,6 +4,7 @@
 #pragma once

 #include "ck_tile/core.hpp"
+#include "ck_tile/host/device_prop.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/fmha.hpp"
@@ -143,6 +144,8 @@ struct fmha_fwd_args
    float scale_p;
    float scale_o;

+    float logits_soft_cap;
+
    ck_tile::index_t stride_q;
    ck_tile::index_t stride_k;
    ck_tile::index_t stride_v;
@@ -167,6 +170,7 @@ struct fmha_fwd_args
    ck_tile::index_t window_size_left;
    ck_tile::index_t window_size_right;
    ck_tile::index_t mask_type;
+    ck_tile::index_t min_seqlen_q;

    float p_drop;
    bool s_randval;
@@ -175,6 +179,86 @@ struct fmha_fwd_args
        drop_seed_offset;
 };

+struct fmha_fwd_pagedkv_args
+{
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* bias_ptr; // bias or alibi_slope pointer
+    void* lse_ptr;
+    void* o_ptr;
+
+    void* block_table_ptr;
+    ck_tile::index_t batch_stride_block_table; // only used if 'block_table_ptr' is not nullptr
+    ck_tile::index_t page_block_size;          // only used if 'block_table_ptr' is not nullptr
+    bool is_gappy; // differentiate seqstart_k_ptr usage. only used if 'block_table_ptr' is not
+                   // nullptr.
+
+    const void* cache_batch_idx;
+
+    // the real seqlen_q & seqlen_k are decided by following:
+    // batch mode: seqlen_q = kargs.seqlen_q
+    //             seqlen_k = kargs.seqlen_k
+    // group mode: seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
+    //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
+    //                      or kargs.seqlen_k_ptr[b]
+    //
+    // batch mode (kvcache):
+    //             seqlen_q = kargs.seqlen_q
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    // group mode (kvcache):
+    //             seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
+    //
+    //     when is_gappy=true:
+    //             seqlen_k = kargs.seqlen_k_ptr[b]
+    //             seqstart_k_ptr[b] now store local offset of each batch
+    //
+    //     when is_gappy=false:
+    //             seqlen_k = kargs.seqstart_k_ptr[b + 1] - kargs.seqstart_k_ptr[b]
+    //                      or kargs.seqlen_k_ptr[b]
+    const void* seqstart_q_ptr;
+    const void* seqstart_k_ptr;
+    const void* seqlen_k_ptr;
+
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
+    ck_tile::index_t batch;
+    ck_tile::index_t max_seqlen_q;
+    ck_tile::index_t hdim_q;
+    ck_tile::index_t hdim_v;
+    ck_tile::index_t nhead_q;
+    ck_tile::index_t nhead_k;
+
+    float scale_s;
+    float scale_p;
+    float scale_o;
+
+    float logits_soft_cap;
+
+    ck_tile::index_t stride_q;
+    ck_tile::index_t stride_k;
+    ck_tile::index_t stride_v;
+    ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
+    ck_tile::index_t stride_o;
+    ck_tile::index_t nhead_stride_q;
+    ck_tile::index_t nhead_stride_k;
+    ck_tile::index_t nhead_stride_v;
+    ck_tile::index_t nhead_stride_bias;
+    ck_tile::index_t nhead_stride_lse;
+    ck_tile::index_t nhead_stride_o;
+    ck_tile::index_t batch_stride_q;
+    ck_tile::index_t batch_stride_k;
+    ck_tile::index_t batch_stride_v;
+    ck_tile::index_t batch_stride_bias;
+    ck_tile::index_t batch_stride_lse;
+    ck_tile::index_t batch_stride_o;
+
+    ck_tile::index_t window_size_left;
+    ck_tile::index_t window_size_right;
+    ck_tile::index_t mask_type;
+    ck_tile::index_t min_seqlen_q;
+};
+
 struct fmha_fwd_splitkv_args
 {
    const void* q_ptr;
@@ -232,6 +316,8 @@ struct fmha_fwd_splitkv_args
    float scale_p;
    float scale_o;

+    float logits_soft_cap;
+
    ck_tile::index_t stride_q;
    ck_tile::index_t stride_k;
    ck_tile::index_t stride_v;
@@ -308,6 +394,85 @@ struct fmha_fwd_appendkv_args
    ck_tile::index_t batch_stride_vnew;
 };

+struct fmha_batch_prefill_args
+{
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* bias_ptr; // bias or alibi_slope pointer
+    void* rand_val_ptr;
+    void* lse_ptr;
+    void* o_ptr;
+
+    // the real seqlen_q & seqlen_k are decided by following:
+    // batch mode (kvcache):
+    //             seqlen_q = kargs.seqlen_q
+    //             seqlen_k = kargs.page_block_size * (kargs.kv_indptr[b + 1] - kargs.kv_indptr[b] -
+    //             1) +
+    //                        kargs.kv_last_page_lens[b]
+    // group mode (kvcache):
+    //             seqlen_q = kargs.seqstart_q_ptr[b + 1] - kargs.seqstart_q_ptr[b]
+    //             seqlen_k = kargs.page_block_size * (kargs.kv_indptr[b + 1] - kargs.kv_indptr[b] -
+    //             1) +
+    //                        kargs.kv_last_page_lens[b]
+    const void* seqstart_q_ptr;
+
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
+    ck_tile::index_t batch;
+    ck_tile::index_t max_seqlen_q;
+    ck_tile::index_t hdim_q;
+    ck_tile::index_t hdim_v;
+    ck_tile::index_t nhead_q;
+    ck_tile::index_t nhead_k;
+
+    // SGLang-style page table
+    int32_t num_total_pages;
+    void* kv_indptr;
+    void* kv_page_indices;
+#if 0 // we assume page_block_size=1 for now
+    void* kv_last_page_lens;
+    ck_tile::index_t page_block_size;
+#endif
+
+    float scale_s;
+    float scale_p;
+    float scale_o;
+
+    float logits_soft_cap;
+
+    ck_tile::index_t stride_q;
+    ck_tile::index_t stride_k;
+    ck_tile::index_t stride_v;
+    ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
+    ck_tile::index_t stride_randval;
+    ck_tile::index_t stride_o;
+    ck_tile::index_t nhead_stride_q;
+    ck_tile::index_t nhead_stride_k;
+    ck_tile::index_t nhead_stride_v;
+    ck_tile::index_t nhead_stride_bias;
+    ck_tile::index_t nhead_stride_randval;
+    ck_tile::index_t nhead_stride_lse;
+    ck_tile::index_t nhead_stride_o;
+    ck_tile::index_t batch_stride_q;
+    ck_tile::index_t batch_stride_k;
+    ck_tile::index_t batch_stride_v;
+    ck_tile::index_t batch_stride_bias;
+    ck_tile::index_t batch_stride_randval;
+    ck_tile::index_t batch_stride_lse;
+    ck_tile::index_t batch_stride_o;
+
+    ck_tile::index_t window_size_left;
+    ck_tile::index_t window_size_right;
+    ck_tile::index_t mask_type;
+
+    float p_drop;
+    bool s_randval;
+
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
+};
+
 template <typename FmhaKernel>
 auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
 {
@@ -333,6 +498,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                             args.scale_s,
                                             args.scale_p,
                                             args.scale_o,
+                                             args.logits_soft_cap,
                                             args.stride_q,
                                             args.stride_k,
                                             args.stride_v,
@@ -349,6 +515,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                             args.window_size_left,
                                             args.window_size_right,
                                             args.mask_type,
+                                             args.min_seqlen_q,
                                             args.p_drop,
                                             args.s_randval,
                                             args.drop_seed_offset);
@@ -371,6 +538,7 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
                                             args.scale_s,
                                             args.scale_p,
                                             args.scale_o,
+                                             args.logits_soft_cap,
                                             args.stride_q,
                                             args.stride_k,
                                             args.stride_v,
@@ -414,6 +582,114 @@ auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
    }
 }

+template <typename FmhaKernel>
+auto fmha_fwd_pagedkv_create_kargs_and_grids(fmha_fwd_pagedkv_args args)
+{
+    assert(args.nhead_q % args.nhead_k == 0);
+    auto kargs = [&] {
+        // create group mode kernel arguments
+        if constexpr(FmhaKernel::kIsGroupMode)
+        {
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqstart_q_ptr,
+                                         args.seqstart_k_ptr,
+                                         args.seqlen_k_ptr,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.block_table_ptr,
+                                         args.batch_stride_block_table,
+                                         args.page_block_size,
+                                         args.is_gappy,
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type,
+                                         args.min_seqlen_q);
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqlen_q,
+                                         args.seqlen_k,
+                                         args.seqlen_k_ptr,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.block_table_ptr,
+                                         args.batch_stride_block_table,
+                                         args.page_block_size,
+                                         args.cache_batch_idx,
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_q,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.batch_stride_bias,
+                                         args.batch_stride_lse,
+                                         args.batch_stride_o,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type);
+        }
+    }();
+
+    // FmhaKernel::PrintParameters(kargs, args.batch);
+    if constexpr(FmhaKernel::kIsGroupMode)
+    {
+        dim3 grids = FmhaKernel::GridSize(
+            args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, args.seqlen_k_ptr != nullptr);
+        return ck_tile::make_tuple(kargs, grids);
+    }
+    else
+    {
+        dim3 grids =
+            FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v, false);
+        return ck_tile::make_tuple(kargs, grids);
+    }
+}
+
 template <typename Kernel>
 auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
 {
@@ -443,6 +719,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.is_gappy,
                                     args.scale_s,
                                     args.scale_p,
+                                     args.logits_soft_cap,
                                     args.stride_q,
                                     args.stride_k,
                                     args.stride_v,
@@ -485,6 +762,7 @@ auto fmha_fwd_splitkv_create_kargs_and_grids(fmha_fwd_splitkv_args args)
                                     args.cache_batch_idx,
                                     args.scale_s,
                                     args.scale_p,
+                                     args.logits_soft_cap,
                                     args.stride_q,
                                     args.stride_k,
                                     args.stride_v,
@@ -618,6 +896,117 @@ auto fmha_fwd_appendkv_create_kargs_and_grids(fmha_fwd_appendkv_args args)
    return ck_tile::make_tuple(kargs, grids);
 }

+template <typename FmhaKernel>
+auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args)
+{
+    assert(args.nhead_q % args.nhead_k == 0);
+    auto kargs = [&] {
+        // create group mode kernel arguments
+        if constexpr(FmhaKernel::kIsGroupMode)
+        {
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.rand_val_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqstart_q_ptr,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.num_total_pages,
+                                         args.kv_indptr,
+                                         args.kv_page_indices,
+#if 0 // we assume page_block_size=1 for now
+                                         args.kv_last_page_lens,
+                                         args.page_block_size,
+#endif
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_randval,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
+        }
+        else
+        { // create batch mode kernel arguments
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.rand_val_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqlen_q,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.num_total_pages,
+                                         args.kv_indptr,
+                                         args.kv_page_indices,
+#if 0 // we assume page_block_size=1 for now
+                                         args.kv_last_page_lens,
+                                         args.page_block_size,
+#endif
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_randval,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_q,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.batch_stride_bias,
+                                         args.batch_stride_randval,
+                                         args.batch_stride_lse,
+                                         args.batch_stride_o,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
+        }
+    }();
+
+    dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);
+    return ck_tile::make_tuple(kargs, grids);
+}
+
 // this is used to pattern-match internl kernel implementation, not to instantiate kernel
 template <ck_tile::index_t HDim_,
          typename DataType_,
@@ -630,6 +1019,7 @@ template <ck_tile::index_t HDim_,
          ck_tile::index_t kK0BlockLength_,
          bool kIsVLayoutRowMajor_,
          ck_tile::BlockFmhaPipelineEnum FmhaPipelineEnum_,
+          bool kHasLogitsSoftCap_,
          typename FmhaMask_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kStoreLse_,
@@ -638,7 +1028,9 @@ template <ck_tile::index_t HDim_,
          bool kPadS_,
          bool kPadSK_,
          bool kPadD_,
-          bool kPadDv_>
+          bool kPadDv_,
+          bool kUseTrLoad_,
+          bool kSkipMinSeqlenQ_ = false>
 struct fmha_fwd_traits_
 {
    static constexpr ck_tile::index_t HDim           = HDim_;
@@ -652,6 +1044,7 @@ struct fmha_fwd_traits_
    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;
    static constexpr bool kIsVLayoutRowMajor         = kIsVLayoutRowMajor_;
    static constexpr auto FmhaPipelineEnum           = FmhaPipelineEnum_;
+    static constexpr bool kHasLogitsSoftCap          = kHasLogitsSoftCap_;
    using FmhaMask                                   = ck_tile::remove_cvref_t<FmhaMask_>;
    static constexpr auto BiasEnum                   = BiasEnum_;
    static constexpr bool kStoreLse                  = kStoreLse_;
@@ -661,6 +1054,8 @@ struct fmha_fwd_traits_
    static constexpr bool kPadSK                     = kPadSK_;
    static constexpr bool kPadD                      = kPadD_;
    static constexpr bool kPadDv                     = kPadDv_;
+    static constexpr bool kUseTrLoad                 = kUseTrLoad_;
+    static constexpr bool kSkipMinSeqlenQ            = kSkipMinSeqlenQ_;
 };

 template <typename Traits_>
@@ -677,6 +1072,58 @@ template <ck_tile::index_t HDim_,
          ck_tile::index_t kK0BlockLength_,
          bool kIsVLayoutRowMajor_,
          ck_tile::BlockFmhaPipelineEnum FmhaPipelineEnum_,
+          bool kHasLogitsSoftCap_,
+          typename FmhaMask_,
+          ck_tile::BlockAttentionBiasEnum BiasEnum_,
+          bool kStoreLse_,
+          bool kIsPagedKV_,
+          bool kDoFp8StaticQuant_,
+          bool kPadS_,
+          bool kPadSK_,
+          bool kPadD_,
+          bool kPadDv_,
+          bool kSkipMinSeqlenQ_ = false>
+struct fmha_fwd_pagedkv_traits_
+{
+    static constexpr ck_tile::index_t HDim           = HDim_;
+    using DataType                                   = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode               = kIsGroupMode_;
+    static constexpr ck_tile::index_t kM0            = kM0_;
+    static constexpr ck_tile::index_t kN0            = kN0_;
+    static constexpr ck_tile::index_t kK0            = kK0_;
+    static constexpr ck_tile::index_t kN1            = kN1_;
+    static constexpr ck_tile::index_t kK1            = kK1_;
+    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;
+    static constexpr bool kIsVLayoutRowMajor         = kIsVLayoutRowMajor_;
+    static constexpr auto FmhaPipelineEnum           = FmhaPipelineEnum_;
+    static constexpr bool kHasLogitsSoftCap          = kHasLogitsSoftCap_;
+    using FmhaMask                                   = ck_tile::remove_cvref_t<FmhaMask_>;
+    static constexpr auto BiasEnum                   = BiasEnum_;
+    static constexpr bool kStoreLse                  = kStoreLse_;
+    static constexpr bool kIsPagedKV                 = kIsPagedKV_;
+    static constexpr bool kDoFp8StaticQuant          = kDoFp8StaticQuant_;
+    static constexpr bool kPadS                      = kPadS_;
+    static constexpr bool kPadSK                     = kPadSK_;
+    static constexpr bool kPadD                      = kPadD_;
+    static constexpr bool kPadDv                     = kPadDv_;
+    static constexpr bool kSkipMinSeqlenQ            = kSkipMinSeqlenQ_;
+};
+
+template <typename Traits_>
+float fmha_fwd_pagedkv_(const ck_tile::stream_config&, fmha_fwd_pagedkv_args);
+
+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          ck_tile::index_t kM0_,
+          ck_tile::index_t kN0_,
+          ck_tile::index_t kK0_,
+          ck_tile::index_t kN1_,
+          ck_tile::index_t kK1_,
+          ck_tile::index_t kK0BlockLength_,
+          bool kIsVLayoutRowMajor_,
+          ck_tile::BlockFmhaPipelineEnum FmhaPipelineEnum_,
+          bool kHasLogitsSoftCap_,
          typename FmhaMask_,
          ck_tile::BlockAttentionBiasEnum BiasEnum_,
          bool kStoreLse_,
@@ -699,6 +1146,7 @@ struct fmha_fwd_splitkv_traits_
    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;
    static constexpr bool kIsVLayoutRowMajor         = kIsVLayoutRowMajor_;
    static constexpr auto FmhaPipelineEnum           = FmhaPipelineEnum_;
+    static constexpr bool kHasLogitsSoftCap          = kHasLogitsSoftCap_;
    using FmhaMask                                   = ck_tile::remove_cvref_t<FmhaMask_>;
    static constexpr auto BiasEnum                   = BiasEnum_;
    static constexpr bool kStoreLse                  = kStoreLse_;
@@ -776,6 +1224,9 @@ struct fmha_fwd_appendkv_traits_
 template <typename Traits_>
 float fmha_fwd_appendkv_(const ck_tile::stream_config&, fmha_fwd_appendkv_args);

+template <typename Traits_>
+float fmha_batch_prefill_(const ck_tile::stream_config&, fmha_batch_prefill_args);
+
 // This is the public API, will be generated by script
 struct fmha_fwd_traits
 {
@@ -784,15 +1235,38 @@ struct fmha_fwd_traits
    std::string data_type;
    bool is_group_mode;
    bool is_v_rowmajor;
+    bool has_logits_soft_cap;
    mask_enum mask_type;
    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
    bool has_lse;
    bool has_dropout;
    bool do_fp8_static_quant;
+    bool skip_min_seqlen_q = false;
    // TODO: padding check is inside this api
 };
 float fmha_fwd(fmha_fwd_traits, fmha_fwd_args, const ck_tile::stream_config&);

+struct fmha_fwd_pagedkv_traits
+{
+    int hdim_q;
+    int hdim_v;
+    std::string data_type;
+    bool is_group_mode;
+    bool is_v_rowmajor;
+    bool has_logits_soft_cap;
+    mask_enum mask_type;
+    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
+    bool has_lse             = false;
+    bool use_pagedkv         = true;
+    bool do_fp8_static_quant = false;
+    bool skip_min_seqlen_q   = false;
+    // TODO: padding check is inside this api
+};
+
+float fmha_fwd_pagedkv(fmha_fwd_pagedkv_traits&,
+                       fmha_fwd_pagedkv_args&,
+                       const ck_tile::stream_config&);
+
 struct fmha_fwd_splitkv_traits
 {
    int hdim_q;
@@ -800,6 +1274,7 @@ struct fmha_fwd_splitkv_traits
    std::string data_type;
    bool is_group_mode;
    bool is_v_rowmajor;
+    bool has_logits_soft_cap;
    mask_enum mask_type;
    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
    bool has_lse;
@@ -821,3 +1296,8 @@ struct fmha_fwd_appendkv_traits
 float fmha_fwd_appendkv(fmha_fwd_appendkv_traits,
                        fmha_fwd_appendkv_args,
                        const ck_tile::stream_config&);
+
+using fmha_batch_prefill_traits = fmha_fwd_traits;
+float fmha_batch_prefill(fmha_batch_prefill_traits,
+                         fmha_batch_prefill_args,
+                         const ck_tile::stream_config&);
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -21,8 +21,7 @@ class HandlerId(IntEnum):
 ops = []
 for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__):
    full_module_name = '%s.%s' % (codegen.ops.__name__, module_name)
-    if full_module_name not in sys.modules:
-        ops.append(importer.find_spec(module_name).loader.load_module(module_name))
+    ops.append(importer.find_spec(module_name).loader.load_module(module_name))
 unwanted_prefix = 'fmha_'
 handlers = dict(
    [(op.__name__[len(unwanted_prefix):] if op.__name__.startswith(unwanted_prefix) else op.__name__,
@@ -30,7 +29,7 @@ handlers = dict(
 )
 assert 0 < len(handlers)

-def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : List[str], receipt, mask_impl) -> None:
+def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> None:
    if output_dir is None:
        output_dir = Path(__file__).parent
    else:
@@ -40,10 +39,10 @@ def write_blobs(output_dir: Optional[str], api_list : List[str], filters_list :

    for api, kernel_filter in zip(api_list, filters_list):
        handler = handlers[api][HandlerId.WRITE_BLOBS]
-        handler(output_dir, kernel_filter, receipt, mask_impl)
+        handler(output_dir, kernel_filter, receipt, optdim_list, mask_impl)

 # list all the files that will be generated
-def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : List[str], receipt, mask_impl) -> None:
+def list_blobs(output_file : Optional[str], api_list : List[str], filters_list : List[str], optdim_list : List[int], receipt, mask_impl) -> None:
    assert output_file is not None
    file_path = Path(output_file)

@@ -52,7 +51,7 @@ def list_blobs(output_file : Optional[str], api_list : List[str], filters_list :

    for api, kernel_filter in zip(api_list, filters_list):
        handler = handlers[api][HandlerId.LIST_BLOBS]
-        handler(file_path, kernel_filter, receipt, mask_impl)
+        handler(file_path, kernel_filter, receipt, optdim_list, mask_impl)

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
@@ -109,16 +108,25 @@ if __name__ == "__main__":
             "  100-199: Only generate instance for Aiter(mha_fwd) integration\n" + \
             "  200-299: Only generate instance for Aiter(mha_varlen_fwd) integration\n" + \
             "  300-399: Only generate instance for Aiter(mha_bwd) integration\n" + \
-             "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration"
+             "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration\n" + \
+             "  600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && aiter::mha_bwd C++ api integration"
+    )

+    parser.add_argument(
+        "--optdim",
+        default='-1',
+        required=False,
+        help="only optimize the hdim in the list. separated by comma. -1 is the default choice" + \
+              "eg. --optdim=32,64,128,256"
    )

    args = parser.parse_args()
    api_list = args.direction.split(',')
    filter_list = args.filter.split(',')
    filter_list.extend([''] * (len(api_list) - len(filter_list)))
+    optdim_list = [int(hdim) for hdim in args.optdim.split(',')]

    if args.list_blobs is not None:
-        list_blobs(args.list_blobs, api_list, filter_list, int(args.receipt), mask_impl=args.mask)
+        list_blobs(args.list_blobs, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask)
    else:
-        write_blobs(args.output_dir, api_list, filter_list, int(args.receipt), mask_impl=args.mask)
+        write_blobs(args.output_dir, api_list, filter_list, optdim_list, int(args.receipt), mask_impl=args.mask)
--- a/example/ck_tile/01_fmha/mask.hpp
+++ b/example/ck_tile/01_fmha/mask.hpp
@@ -21,6 +21,8 @@ enum class mask_enum
 struct mask_info
 {
    mask_enum type;
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
    ck_tile::index_t y, x;
    ck_tile::index_t left, right; // FA style SWA left/right

@@ -42,6 +44,8 @@ struct mask_info
        ck_tile::index_t x_total = seqlen_k;
        ck_tile::index_t y_total = seqlen_q;
        mask_info tmp;
+        tmp.seqlen_q = seqlen_q;
+        tmp.seqlen_k = seqlen_k;
        auto found_0 = str.find(':');
        if(found_0 != std::string::npos)
        {
@@ -148,7 +152,22 @@ struct mask_info
        }
        return tmp;
    }
-
+    ck_tile::index_t get_unmaskarea() const
+    {
+        if(type == mask_enum::no_mask)
+            return seqlen_q * seqlen_k;
+        ck_tile::index_t area = 0;
+        for(ck_tile::index_t i_y = 0; i_y < seqlen_q; ++i_y)
+        {
+            ck_tile::index_t x_start = std::max(-y + i_y + 1, static_cast<ck_tile::index_t>(0));
+            ck_tile::index_t x_end   = std::min(i_y + x, seqlen_k);
+            if(x_end > x_start)
+            {
+                area += (x_end - x_start);
+            }
+        }
+        return area;
+    }
    friend std::ostream& operator<<(std::ostream& os, const mask_info& mi)
    {
        mi.serialize(os);
--- a/example/ck_tile/01_fmha/script/benchmark_fwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
@@ -18,14 +18,3 @@ $EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kn
 done
 done
 done
-
-for perm in 0 1 ; do
-
-$EXE -prec=fp8 -squant=1 -b=32 -h=16 -d=128 -s=512   -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=16 -h=16 -d=128 -s=1024  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=8  -h=16 -d=128 -s=2048  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=4  -h=16 -d=128 -s=4096  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=2  -h=16 -d=128 -s=8192  -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-$EXE -prec=fp8 -squant=1 -b=1  -h=16 -d=128 -s=16384 -iperm=$perm -operm=$perm -vlayout=c -range_q=240 -range_k=240 -range_v=240 -range_p=240 -range_o=240 -kname=1 -v=$VALID ; sleep 3
-
-done
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -42,7 +42,6 @@ run_fp16_bf16_tests() {
    for prec in "fp16" "bf16" ; do
    for mode in 1 0 ; do
    for perm in 0 1 ; do
-    for vlayout in "r" "c" ; do
    for hdim in 32 64 128 256 ; do
    for lse in 0 1 ; do
    for bias in "n" "e" "a" ; do
@@ -51,16 +50,16 @@ run_fp16_bf16_tests() {
    for page_block_size in $PAGE_BLOCK_SIZE ; do
    for cache_batch_idx in $CACHE_BATCH_IDX ; do

-    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
-    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    # $EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16, -d_v=$hdim -s=55 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=1  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=t:128,30  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=2 -h=1 -d=$hdim -s=99 -s_k=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=b:4,35  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  
+    $EXE -prec=$prec -mode=$mode -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 -bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm -mask=2  -num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS  

    done ; done ; done ; done ; done
    done ; done ; done ; done ; done
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
@@ -25,7 +25,7 @@ add_custom_command(

 set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")

-message("adding example ${EXAMPLE_LAYERNORM2D_FWD}")
+message(DEBUG "adding example ${EXAMPLE_LAYERNORM2D_FWD}")
 add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
 target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -75,22 +75,22 @@ struct layernorm2d_fwd_traits_
    using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -98,13 +98,13 @@ struct layernorm2d_fwd_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -191,8 +191,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return base_str;
    }();

-    std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+    std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
              << ", yr_stride:" << yr_stride << std::flush;

--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,5 +1,18 @@
 add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
 add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
-target_compile_options(tile_example_gemm_universal PRIVATE
-  -mllvm -enable-noalias-to-md-conversion=0
-)
+add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
+add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp)
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
+list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
+target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
+target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -12,26 +12,29 @@ sh ../script/cmake-ck-dev.sh  ../ <arch>
 make tile_example_gemm_basic -j
 # The memory bound pipeline on the gemm calculation
 make tile_example_gemm_universal -j
+# The weight preshuffle pipeline on the gemm calculation
+make tile_example_gemm_weight_preshuffle -j
 ```
 This will result in an executable `build/bin/tile_example_gemm_basic` & `build/bin/tile_example_gemm_universal`

 ## example
 ```
 args:
-          -b    batch size (default:1)
          -m    m dimension (default:1024)
          -n    n dimension (default:2048)
          -k    k dimension (default:64)
   -a_layout    Tensor A data layout (default: R)
-   -b_layout    Tensor B data layout (default: R)
+   -b_layout    Tensor B data layout (default: C)
   -c_layout    Tensor C data layout (default: R)
   -stride_a    Tensor A stride (default:0)
   -stride_b    Tensor B stride (default:0)
   -stride_c    Tensor C stride (default:0)
          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
-          -e    Absolute error tolerance (default:1e-5)
-       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+       -prec    data type. fp16/bf16/fp8/bf8/int8 (default:fp16)
     -warmup    number of iterations before benchmark the kernel (default:10)
     -repeat    number of iterations to benchmark the kernel (default:100)
      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+    -split_k    splitK value (default:1)
+       -init    0:random, 1:linear, 2:constant (default:1)
+ -persistent    0:non-persistent, 1:persistent (default:0)
 ```
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -1,32 +1,25 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

-#include <hip/hip_runtime.h>
-
-#include <cstring>
-#include <iostream>
-#include <ostream>
-#include <string>
-#include <tuple>
-
-#include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"

-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
          typename BDataType,
+          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
-float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
-{
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    constexpr bool kPadM = false;
-    constexpr bool kPadN = false;
-    constexpr bool kPadK = false;
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)

-    constexpr int kBlockPerCu = 1;
+{
+    if constexpr(Persistent)
+        std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;

    // This part comes from the Codegen
    constexpr ck_tile::index_t M_Tile = 256;
@@ -48,61 +41,89 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&

    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;

-    using CodegenGemmTraits =
-        ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+    using CodegenGemmTraits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                      GemmConfig::kPadN,
+                                                      GemmConfig::kPadK,
+                                                      ALayout,
+                                                      BLayout,
+                                                      CLayout>;
+
    using CodegenPipelineProblem = ck_tile::
        GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenGemmShape, CodegenGemmTraits>;
+
    using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
-    using GemmEpilogue        = ck_tile::CShuffleEpilogue<
-        ck_tile::CShuffleEpilogueProblem<ADataType,
-                                         BDataType,
-                                         AccDataType,
-                                         CDataType,
-                                         CLayout,
-                                         CodegenPipelineProblem::kBlockSize,
-                                         TilePartitioner::MPerBlock,
-                                         TilePartitioner::NPerBlock,
-                                         M_Warp,
-                                         N_Warp,
-                                         M_Warp_Tile,
-                                         N_Warp_Tile,
-                                         K_Warp_Tile,
-                                         CodegenPipelineProblem::TransposeC>>;
-    // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
-    // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
-    using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;

-    auto kargs = Kernel::MakeKernelArgs(args);
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;

-    const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
-    constexpr dim3 blocks = Kernel::BlockSize();
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             ck_tile::tuple<>,
+                                             AccDataType,
+                                             CDataType,
+                                             ck_tile::tuple<>,
+                                             CLayout,
+                                             ck_tile::element_wise::PassThrough,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation>>;

-    if(!Kernel::IsSupportedArgument(kargs))
+        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenGemmShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        float ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                       Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+
+    if(args.k_batch == 1)
    {
-        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        return Run(MemoryOpSet{});
    }
-
-    if(s.log_level_ > 0)
+    else
    {
-        std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                  << "shape: " << CodegenGemmShape::GetName() << '\n'
-                  << "problem: " << CodegenPipelineProblem::GetName() << '\n'
-                  << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
-                  << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                  << std::endl;
+        return Run(MemoryOpAtomicAdd{});
    }
-
-    float ave_time = ck_tile::launch_kernel(
-        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-    return ave_time;
 }

 #include "run_gemm_example.inc"

 template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
    using Row = ck_tile::tensor_layout::gemm::RowMajor;
    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
@@ -111,13 +132,13 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    {
        if(a_layout == "R" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
        }
        else
        {
@@ -127,25 +148,25 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    }
    else
    {
-        if(a_layout == "R" && b_layout == "R")
+        if(a_layout == "R" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Row{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
        }
-        else if(a_layout == "R" && b_layout == "C")
+        else if(a_layout == "R" && b_layout == "R")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Row{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "R")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Row{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Row{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfigBase, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
        }
        else
        {
@@ -154,47 +175,67 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    }
 }

-int run_gemm_example(int argc, char* argv[])
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
    std::string data_type = arg_parser.get_str("prec");
    std::string a_layout  = arg_parser.get_str("a_layout");
    std::string b_layout  = arg_parser.get_str("b_layout");

    if(data_type == "fp16")
    {
-        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf16")
    {
-        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, arg_parser);
    }
    else if(data_type == "fp8")
    {
        return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf8")
    {
        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "i8")
+    {
+        return run_gemm_example_prec_type<ck_tile::int8_t, ck_tile::int8_t, int32_t>(
+            a_layout, b_layout, arg_parser);
    }
-
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
    else if(data_type == "pk_int4_t")
    {
        // TODO: Add support for bhalf_t ADataType
-        return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+        if constexpr(GemmConfigBase::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
+                a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported data type for this operation !!!");
+        }
    }
-#endif
    else
    {
        throw std::runtime_error("Unsupported data type for this operation !!!");
    }
 }

-int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    try
+    {
+        return !run_gemm_example(arg_parser);
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -1,4 +1,3 @@
-
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

@@ -14,78 +13,46 @@
 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
+#define CK_TILE_PIPELINE_COMPUTE_V5 4
+#define CK_TILE_PIPELINE_PRESHUFFLE_V1 5
+#define CK_TILE_PIPELINE_PRESHUFFLE_V2 6

-#ifndef CK_TILE_PIPELINE_DEFAULT
-#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
-#endif
-
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#else
-#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
-#endif
-
-struct GemmConfig
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
 {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 32;
-    static constexpr ck_tile::index_t K_Tile = 64;
-
-    static constexpr ck_tile::index_t M_Warp = 4;
-    static constexpr ck_tile::index_t N_Warp = 1;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    static constexpr bool DoubleSmemBuffer = false;
+#if defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
 #endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 64;
+}

-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    static constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    static constexpr ck_tile::index_t M_Tile = 256;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 32;
-
-    static constexpr ck_tile::index_t M_Warp = 2;
-    static constexpr ck_tile::index_t N_Warp = 2;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    static constexpr bool DoubleSmemBuffer = true;
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile_flatmm()
+{
+#if defined(CK_GFX950_SUPPORT)
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 64;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 128;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return sizeof(PrecType) == 2 ? 16 : 32;
+    else
+        return sizeof(PrecType) == 2 ? 32 : 64;
 #endif
+}

+struct GemmConfigBase
+{
    static constexpr bool kPadM = false;
    static constexpr bool kPadN = false;
    static constexpr bool kPadK = false;
@@ -93,11 +60,241 @@ struct GemmConfig
    static constexpr bool PermuteA = false;
    static constexpr bool PermuteB = false;

-    static constexpr bool TransposeC = false;
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;

    static constexpr int kBlockPerCu                         = 1;
    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryInterwave : public GemmConfigBase
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryIntrawave : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_MEMORY;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3 : public GemmConfigBase
+{
+    // Compute V3 only support Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_1 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_2 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_WMMA : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4 : public GemmConfigBase
+{
+    // Compute V4 only support Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4_1 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV5 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 2;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer               = false;
+    static constexpr ck_tile::index_t Pipeline           = CK_TILE_PIPELINE_COMPUTE_V5;
+    static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshuffleDecode : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+
+    static constexpr int kBlockPerCu           = 1;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = true;
+};
+
+template <typename PrecType>
+struct GemmConfigPreshufflePrefill : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile_flatmm<PrecType, M_Warp_Tile>();
+
+    static constexpr int kBlockPerCu           = 2;
+    static constexpr auto Scheduler            = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_PRESHUFFLE_V2;
+    static constexpr bool Preshuffle           = true;
+    static constexpr bool DoubleSmemBuffer     = true;
 };

 template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
@@ -149,6 +346,15 @@ struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
    using CDataType   = ck_tile::half_t;
 };

+template <>
+struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
+{
+    using ADataType   = ck_tile::int8_t;
+    using BDataType   = ck_tile::int8_t;
+    using AccDataType = int32_t;
+    using CDataType   = int32_t;
+};
+
 template <typename T>
 struct DataTypeTraits;

@@ -164,6 +370,12 @@ struct DataTypeTraits<double>
    static constexpr const char* name = "fp64";
 };

+template <>
+struct DataTypeTraits<int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
 template <>
 struct DataTypeTraits<ck_tile::half_t>
 {
@@ -194,6 +406,71 @@ struct DataTypeTraits<ck_tile::pk_int4_t>
    static constexpr const char* name = "pk_int4_t";
 };

+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V1>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV1<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline =
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV1<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_PRESHUFFLE_V2>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline =
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+};
+
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
@@ -212,11 +489,31 @@ auto create_args(int argc, char* argv[])
        .insert("repeat", "100", "number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
        .insert("split_k", "1", "splitK value")
-        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("persistent", "0", "0:non-persistent, 1:persistent")
+        .insert("flush_cache", "true", "flush cache before running the kernel, defaults to true")
+        .insert("rotating_count", "1000", "rotating count, defaults to 1000");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
 }

+// Type aliases for memory operation integral constants
+using MemoryOpSet =
+    std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
+using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
+                                                 ck_tile::memory_operation_enum::atomic_add>;
+
 // host API
-float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent = false,
+          typename CDEElementWise>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                   GemmConfig::TileParitionerGroupNum,
+                                                   GemmConfig::TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                           GemmConfig::kPadN,
+                                           GemmConfig::kPadK,
+                                           ALayout,
+                                           BLayout,
+                                           ELayout,
+                                           GemmConfig::NumWaveGroups>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler,
+                                                                           has_hot_loop_v,
+                                                                           tail_number_v>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             UniversalGemmProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << ", kBlockPerCu: {" << GemmConfig::kBlockPerCu << "}" << std::endl;
+        }
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                    Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time =
+                ck_tile::launch_kernel(s,
+                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                           Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    return ave_time;
+}
+
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
+{
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && (a_layout != "R" || b_layout != "C"))
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+            arg_parser, Row{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout for the input matrices!");
+    }
+}
+
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    try
+    {
+        return !run_gemm_example<GemmConfigPreshuffleDecode>(arg_parser);
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Caught runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -30,7 +30,8 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }

-template <typename Tensor,
+template <typename GemmConfig,
+          typename Tensor,
          typename ADataType,
          typename BDataType,
          typename AccDataType,
@@ -55,18 +56,20 @@ void permute_tensor_b(Tensor& tensor)
                                                                 ALayout,
                                                                 BLayout,
                                                                 CLayout,
-                                                                 GemmConfig::TransposeC>;
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity>;

    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                                       BDataType,
                                                                       AccDataType,
                                                                       GemmShape,
                                                                       GemmUniversalTraits,
-                                                                       GEMM_PIPELINE_SCHEDULER,
+                                                                       GemmConfig::Scheduler,
                                                                       true,
                                                                       ck_tile::TailNumber::Full>;

-    using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+    using GemmPipeline = typename PipelineTypeTraits<GemmConfig::Pipeline>::template GemmPipeline<
+        UniversalGemmProblem>;

    const ck_tile::index_t K  = tensor.get_length(0);
    const ck_tile::index_t N  = tensor.get_length(1);
@@ -143,13 +146,31 @@ void permute_vectors_i4x4_b(Tensor& tensor)
    }
 }

-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
          typename BDataType,
+          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
+
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                  ck_tile::DeviceMem& b_k_n_dev_buf,
                  ck_tile::DeviceMem& c_m_n_dev_buf,
@@ -161,23 +182,59 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                  ck_tile::index_t stride_C,
                  ck_tile::index_t kbatch,
                  int n_warmup,
-                  int n_repeat)
+                  int n_repeat,
+                  bool persistent,
+                  bool flush_cache,
+                  int rotating_count)
 {
-    ck_tile::GemmHostArgs args;
-    args.a_ptr    = a_m_k_dev_buf.GetDeviceBuffer();
-    args.b_ptr    = b_k_n_dev_buf.GetDeviceBuffer();
-    args.c_ptr    = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch  = kbatch;
-    args.M        = M;
-    args.N        = N;
-    args.K        = K;
-    args.stride_A = stride_A;
-    args.stride_B = stride_B;
-    args.stride_C = stride_C;
+    ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
+                                  b_k_n_dev_buf.GetDeviceBuffer(),
+                                  c_m_n_dev_buf.GetDeviceBuffer(),
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_A,
+                                  stride_B,
+                                  stride_C};

-    float ave_time =
-        gemm_calc<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+    float ave_time;
+    if(persistent)
+    {
+        ave_time = gemm<GemmConfig,
+                        ADataType,
+                        BDataType,
+                        DsDataType,
+                        AccDataType,
+                        CDataType,
+                        ALayout,
+                        BLayout,
+                        DsLayout,
+                        CLayout,
+                        true,
+                        CDEElementWise>(
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
+    }
+    else
+    {
+        ave_time = gemm<GemmConfig,
+                        ADataType,
+                        BDataType,
+                        DsDataType,
+                        AccDataType,
+                        CDataType,
+                        ALayout,
+                        BLayout,
+                        DsLayout,
+                        CLayout,
+                        false,
+                        CDEElementWise>(
+            args,
+            ck_tile::stream_config{
+                nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count});
+    }

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_byte =
@@ -185,33 +242,67 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_byte / 1.E6 / ave_time;

-    std::cout << "Run Gemm kernel with M =" << M << " N =" << N << " K =" << K
-              << " StrideA =" << stride_A << " StrideB =" << stride_B << " StrideC =" << stride_C
-              << " A_Layout =" << ALayout::name << " B_Layout =" << BLayout::name
-              << " C_Layout =" << CLayout::name << " A Type = " << DataTypeTraits<ADataType>::name
-              << " B Type = " << DataTypeTraits<BDataType>::name
-              << " C Type = " << DataTypeTraits<CDataType>::name << " : " << ave_time << " ms, "
-              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+    std::cout << "Run Gemm kernel with \n M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name << " A_Type=" << DataTypeTraits<ADataType>::name
+              << " B_Type=" << DataTypeTraits<BDataType>::name
+              << " C_Type=" << DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " Persistent=" << (persistent ? "on" : "off") << " : \n"
+              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;

    return ave_time;
 }

-template <typename ADataType,
+template <typename GemmConfig, typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_                = t.get_lengths()[1];
+    int k_                = t.get_lengths()[0];
+    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                   GemmConfig::N_Warp_Tile,
+                                   k_ / GemmConfig::K_Warp_Tile,
+                                   divisor,
+                                   GemmConfig::K_Warp_Tile / divisor});
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+}
+
+template <typename CDataType>
+bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+               const ck_tile::HostTensor<CDataType>& c_m_n_ref,
+               const ck_tile::tuple<double, double>& rtol_atol,
+               const char* variant)
+{
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_ref,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+    std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+              << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The " << variant << " verification result is:" << (pass ? "correct" : "fail")
+              << std::endl;
+    return pass;
+}
+
+template <typename GemmConfig,
+          typename ADataType,
          typename BDataType = ADataType,
          typename CDataType = ADataType,
          typename ALayout,
          typename BLayout,
          typename CLayout>
-int run_gemm_example_with_layouts(int argc,
-                                  char* argv[],
+int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
                                  const ALayout a_layout                  = ALayout{},
                                  const BLayout b_layout                  = BLayout{},
                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;

    ck_tile::index_t M = arg_parser.get_int("m");
@@ -226,6 +317,11 @@ int run_gemm_example_with_layouts(int argc,
    int n_warmup                 = arg_parser.get_int("warmup");
    int n_repeat                 = arg_parser.get_int("repeat");
    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool persistent              = arg_parser.get_int("persistent");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+    int rotating_count           = arg_parser.get_int("rotating_count");
+
+    const bool preshuffle = GemmConfig::Preshuffle;

    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
@@ -240,8 +336,16 @@ int run_gemm_example_with_layouts(int argc,

    if(init_method == 0)
    {
-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        if constexpr(preshuffle)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_k_n);
+        }
+        else
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        }
    }
    else if(init_method == 1)
    {
@@ -250,8 +354,8 @@ int run_gemm_example_with_layouts(int argc,
    }
    else if(init_method == 2)
    {
-        ck_tile::FillConstant<ADataType>{static_cast<ADataType>(1)}(a_m_k);
-        ck_tile::FillConstant<BDataType>{static_cast<BDataType>(1)}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
    }
    else
    {
@@ -259,82 +363,101 @@ int run_gemm_example_with_layouts(int argc,
        b_k_n.SetZero();
    }

+    if(!preshuffle && GemmConfig::UseStructuredSparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());

    static_assert(!GemmConfig::PermuteA, "Not implemented");
-    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+
+    if constexpr(preshuffle)
    {
-        // Permute vector pk_i4x4 data for device implementation
-        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
-        if constexpr(GemmConfig::PermuteB)
-        {
-            permute_tensor_b<decltype(b_k_n_dev),
-                             ADataType,
-                             BDataType,
-                             AccDataType,
-                             CDataType,
-                             ALayout,
-                             BLayout,
-                             CLayout>(b_k_n_dev);
-        }
-        permute_vectors_i4x4_b(b_k_n_dev);
-        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
+        // shuffled buffer B for device implementation
+        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
    }
    else
    {
-        if constexpr(GemmConfig::PermuteB)
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
        {
-            std::cout << "Permute for this DataType is not implemented." << std::endl;
-            return false;
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+            if constexpr(GemmConfig::PermuteB)
+            {
+                permute_tensor_b<GemmConfig,
+                                 decltype(b_k_n_dev),
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 CLayout>(b_k_n_dev);
+            }
+            permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            if constexpr(GemmConfig::PermuteB)
+            {
+                std::cout << "Permute for this DataType is not implemented." << std::endl;
+                return false;
+            }
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
        }
-        b_k_n_dev_buf.ToDevice(b_k_n.data());
    }

    a_m_k_dev_buf.ToDevice(a_m_k.data());
    c_m_n_dev_buf.SetZero();
    c_m_n_dev_result.SetZero();

-    invoke_gemm<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-        a_m_k_dev_buf,
-        b_k_n_dev_buf,
-        c_m_n_dev_buf,
-        M,
-        N,
-        K,
-        stride_A,
-        stride_B,
-        stride_C,
-        kbatch,
-        n_warmup,
-        n_repeat);
+    invoke_gemm<GemmConfig,
+                ADataType,
+                BDataType,
+                ck_tile::tuple<>,
+                AccDataType,
+                CDataType,
+                ALayout,
+                BLayout,
+                ck_tile::tuple<>,
+                CLayout>(a_m_k_dev_buf,
+                         b_k_n_dev_buf,
+                         c_m_n_dev_buf,
+                         M,
+                         N,
+                         K,
+                         stride_A,
+                         stride_B,
+                         stride_C,
+                         kbatch,
+                         n_warmup,
+                         n_repeat,
+                         persistent,
+                         flush_cache,
+                         rotating_count);

    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
    bool pass = true;

+    // memory on host to store gpu reference result
+    ck_tile::HostTensor<CDataType> c_m_n_ref(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    c_m_n_ref.SetZero();
+
    if(arg_parser.get_int("v") == 1)
    {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        c_m_n_host_ref.SetZero();
-
        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
-            a_m_k, b_k_n, c_m_n_host_ref);
+            a_m_k, b_k_n, c_m_n_ref);
        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_host_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
    }
    else if(arg_parser.get_int("v") == 2)
    {
@@ -343,29 +466,18 @@ int run_gemm_example_with_layouts(int argc,
            // Restore input for B for gpu reference
            b_k_n_dev_buf.ToDevice(b_k_n.data());
        }
-        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
-        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
-        c_m_n_gpu_ref.SetZero();
+        if constexpr(GemmConfig::Preshuffle)
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
+        // memory on device to store gpu reference result
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
        c_m_n_gpu_buf_ref.SetZero();

-        ADataType* d_A;
-        BDataType* d_B;
-        CDataType* d_C;
-
-        ck_tile::hip_check_error(hipMalloc(&d_A, a_m_k.get_element_space_size_in_bytes()));
-        ck_tile::hip_check_error(hipMalloc(&d_B, b_k_n.get_element_space_size_in_bytes()));
-        ck_tile::hip_check_error(
-            hipMalloc(&d_C, c_m_n_dev_result.get_element_space_size_in_bytes()));
-
-        ck_tile::hip_check_error(hipMemcpy(d_A,
-                                           a_m_k_dev_buf.GetDeviceBuffer(),
-                                           a_m_k.get_element_space_size_in_bytes(),
-                                           hipMemcpyHostToDevice));
-        ck_tile::hip_check_error(hipMemcpy(d_B,
-                                           b_k_n_dev_buf.GetDeviceBuffer(),
-                                           b_k_n.get_element_space_size_in_bytes(),
-                                           hipMemcpyHostToDevice));
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());

        ck_tile::reference_gemm_gpu<ADataType,
                                    BDataType,
@@ -375,30 +487,13 @@ int run_gemm_example_with_layouts(int argc,
                                    BLayout,
                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);

-        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
-                                           d_C,
-                                           c_m_n_dev_result.get_element_space_size_in_bytes(),
-                                           hipMemcpyDeviceToHost));
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());

-        ck_tile::hip_check_error(hipFree(d_A));
-        ck_tile::hip_check_error(hipFree(d_B));
-        ck_tile::hip_check_error(hipFree(d_C));
-
-        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
        const float max_accumulated_value =
-            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
            K, kbatch, max_accumulated_value);
-        pass = ck_tile::check_err(c_m_n_dev_result,
-                                  c_m_n_gpu_ref,
-                                  "Error: Incorrect results!",
-                                  rtol_atol.at(ck_tile::number<0>{}),
-                                  rtol_atol.at(ck_tile::number<1>{}));
-
-        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
-                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
-                  << std::endl;
-        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
    }

    return pass;
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -11,15 +11,22 @@

 #include "ck_tile/host.hpp"
 #include "gemm_utils.hpp"
+#include "run_gemm_example.inc"

-template <typename ADataType,
+template <typename GemmConfig,
+          typename ADataType,
          typename BDataType,
+          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename ALayout,
          typename BLayout,
-          typename CLayout>
-float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+
 {
    using GemmShape = ck_tile::TileGemmShape<
        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
@@ -28,42 +35,52 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
        GemmConfig::PermuteA,
        GemmConfig::PermuteB>;
+
    using TilePartitioner =
        ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
                                                   GemmConfig::TileParitionerGroupNum,
                                                   GemmConfig::TileParitionerM01>;

-    using Traits              = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+    using Traits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
                                           GemmConfig::kPadN,
                                           GemmConfig::kPadK,
                                           ALayout,
                                           BLayout,
-                                           CLayout>;
+                                           ELayout,
+                                           GemmConfig::NumWaveGroups>;
+
    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
                                                                 GemmConfig::kPadN,
                                                                 GemmConfig::kPadK,
                                                                 GemmConfig::DoubleSmemBuffer,
                                                                 ALayout,
                                                                 BLayout,
-                                                                 CLayout,
-                                                                 GemmConfig::TransposeC>;
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
    using GemmPipelineProblem =
        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;

-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
+    using BaseGemmPipeline = typename PipelineTypeTraits<
+        GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;

    const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile;
    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
    float ave_time{0};

-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-        constexpr auto scheduler      = GEMM_PIPELINE_SCHEDULER;
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;

        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                                           BDataType,
@@ -74,14 +91,19 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                                                                           has_hot_loop_v,
                                                                           tail_number_v>;

-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
        using GemmEpilogue = ck_tile::CShuffleEpilogue<
            ck_tile::CShuffleEpilogueProblem<ADataType,
                                             BDataType,
+                                             DsDataType,
                                             AccDataType,
                                             CDataType,
-                                             CLayout,
-                                             GemmPipelineProblem::kBlockSize,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             UniversalGemmProblem::kBlockSize,
                                             TilePartitioner::MPerBlock,
                                             TilePartitioner::NPerBlock,
                                             GemmConfig::M_Warp,
@@ -89,11 +111,22 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&
                                             GemmConfig::M_Warp_Tile,
                                             GemmConfig::N_Warp_Tile,
                                             GemmConfig::K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC>>;
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation,
+                                             GemmConfig::NumWaveGroups>>;
+
        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
        auto kargs   = Kernel::MakeKernelArgs(args);

-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
        constexpr dim3 blocks = Kernel::BlockSize();

        if(!Kernel::IsSupportedArgument(kargs))
@@ -103,167 +136,105 @@ float gemm_calc(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config&

        if(s.log_level_ > 0)
        {
-            std::cout << "Launching kernel with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
                      << std::endl;
        }
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;

-        ave_time = ck_tile::launch_kernel(s,
-                                          ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
-                                              Kernel{}, grids, blocks, 0, kargs));
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                    Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time =
+                ck_tile::launch_kernel(s,
+                                       ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                           Kernel{}, grids, blocks, 0, kargs));
+        }
        return ave_time;
    };

-    if(has_hot_loop)
-    {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        if(tail_num == ck_tile::TailNumber::Full)
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
+            Run(has_hot_loop_, tail_number_, MemoryOpSet{});
        }
        else
        {
-            std::ostringstream err;
-            err << "For compute pipeline tail number should always be Full, but have \"" << tail_num
-                << "\" which is not supported! PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-        // Tail pipeline One to Seven
-        if(tail_num == ck_tile::TailNumber::One)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
+            Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
        }
+    };

-        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
-        {
-            if(tail_num == ck_tile::TailNumber::Two)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
-        {
-            if(tail_num == ck_tile::TailNumber::Three)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
-        {
-            if(tail_num == ck_tile::TailNumber::Four)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
-        {
-            if(tail_num == ck_tile::TailNumber::Five)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
-        {
-            if(tail_num == ck_tile::TailNumber::Six)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
-        {
-            if(tail_num == ck_tile::TailNumber::Seven)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
-            }
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-        if(tail_num == ck_tile::TailNumber::Three)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-        }
-        else
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-        }
-#endif
-    }
-    else
-    {
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else
-        {
-            std::ostringstream err;
-            err << "Num K loop must be larger than number of prefetech stages."
-                << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-    }
-
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
    return ave_time;
 }

-#include "run_gemm_example.inc"
-
-template <typename APrecType, typename BPrecType = APrecType, typename CPrecType = APrecType>
-int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
 {
-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Row       = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col       = ck_tile::tensor_layout::gemm::ColumnMajor;
+    bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    if(preshuffle && a_layout != "R" && b_layout != "C")
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }

    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
    {
        if(a_layout == "R" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
        }
        else
        {
@@ -275,23 +246,23 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    {
        if(a_layout == "R" && b_layout == "R")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Row{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Row{}, Row{});
        }
        else if(a_layout == "R" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Row{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "R")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Row{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Row{}, Row{});
        }
        else if(a_layout == "C" && b_layout == "C")
        {
-            return run_gemm_example_with_layouts<APrecType, BPrecType, CPrecType>(
-                argc, argv, Col{}, Col{}, Row{});
+            return run_gemm_example_with_layouts<GemmConfig, APrecType, BPrecType, CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
        }
        else
        {
@@ -300,43 +271,59 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
    }
 }

-int run_gemm_example(int argc, char* argv[])
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
 {
-    auto [result, arg_parser] = create_args(argc, argv);
-    if(!result)
-        return -1;
-
    std::string data_type = arg_parser.get_str("prec");
    std::string a_layout  = arg_parser.get_str("a_layout");
    std::string b_layout  = arg_parser.get_str("b_layout");

    if(data_type == "fp16")
    {
-        return run_gemm_example_prec_type<ck_tile::half_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf16")
    {
-        return run_gemm_example_prec_type<ck_tile::bf16_t>(a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
    }
    else if(data_type == "fp8")
    {
-        return run_gemm_example_prec_type<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
    }
    else if(data_type == "bf8")
    {
-        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "int8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          ck_tile::int32_t>(a_layout, b_layout, arg_parser);
    }
-
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
    else if(data_type == "pk_int4_t")
    {
        // TODO: Add support for bhalf_t ADataType
-        return run_gemm_example_prec_type<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>(
-            a_layout, b_layout, argc, argv);
+        if constexpr(GemmConfig<ck_tile::half_t>::Pipeline == CK_TILE_PIPELINE_COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
+                                              ck_tile::half_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
    }
-#endif
    else
    {
        throw std::runtime_error("Unsupported data type for this operation !!!");
@@ -345,9 +332,13 @@ int run_gemm_example(int argc, char* argv[])

 int main(int argc, char* argv[])
 {
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
    try
    {
-        run_gemm_example(argc, argv);
+        return !run_gemm_example<GemmConfigComputeV3>(arg_parser);
    }
    catch(const std::runtime_error& e)
    {
@@ -355,5 +346,4 @@ int main(int argc, char* argv[])
        // Return a non-zero code to indicate failure
        return EXIT_FAILURE;
    }
-    return EXIT_SUCCESS;
 }
--- a/example/ck_tile/04_img2col/image_to_column.cpp
+++ b/example/ck_tile/04_img2col/image_to_column.cpp
@@ -149,9 +149,17 @@ int main(int argc, char* argv[])
    float ave_time =
        image_to_column(traits, args, ck_tile::stream_config{nullptr, config.time_kernel});

-    std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
-    float gb_per_sec      = num_btype / 1.E6 / ave_time;
-    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    if(config.time_kernel)
+    {
+        std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
+        float gb_per_sec      = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    }
+    else
+    {
+        std::cout << "image_to_column: pass, No Perf generated due to config.time_kernel=0"
+                  << std::endl;
+    }

    bool pass = true;

--- a/example/ck_tile/05_reduce/CMakeLists.txt
+++ b/example/ck_tile/05_reduce/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(EXAMPLE_REDUCE "tile_example_reduce")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-message("adding example ${EXAMPLE_REDUCE}")
+message(DEBUG "adding example ${EXAMPLE_REDUCE}")

 add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp)
 target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -1,16 +1,21 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #include "ck_tile/host.hpp"
-#include "reduce.hpp"
+#include "ck_tile/ops/reduce.hpp"
 #include <cstring>

 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3328", "m dimension")
-        .insert("n", "4096", "n dimension")
+    arg_parser.insert("n", "32", "n dimension")
+        .insert("h", "7", "h dimension")
+        .insert("w", "7", "w dimension")
+        .insert("c", "512", "c dimension")
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
-        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("warmup", "0", "cold iter")
+        .insert("repeat", "1", "hot iter");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -23,15 +28,28 @@ bool run(const ck_tile::ArgParser& arg_parser)
    using ComputeDataType = float;
    using YDataType       = DataType;

-    ck_tile::index_t m = arg_parser.get_int("m");
-    ck_tile::index_t n = arg_parser.get_int("n");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
    int do_validation  = arg_parser.get_int("v");
    int warmup         = arg_parser.get_int("warmup");
    int repeat         = arg_parser.get_int("repeat");

-    ck_tile::HostTensor<XDataType> x_host({m, n});
-    ck_tile::HostTensor<YDataType> y_host_ref({m});
-    ck_tile::HostTensor<YDataType> y_host_dev({m});
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimension to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
+
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_ref({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({N, C}, {C, 1});

    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);

@@ -54,7 +72,9 @@ bool run(const ck_tile::ArgParser& arg_parser)

    constexpr ck_tile::index_t kBlockSize  = 256;
    constexpr ck_tile::index_t kBlockPerCu = 1;
-    ck_tile::index_t kGridSize             = (m / BlockTile::at(ck_tile::number<0>{}));
+    ck_tile::index_t kept_dim_len_prod     = N * C;
+    ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
    std::cout << "grid size " << kGridSize << std::endl;

    using Shape = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, Vector>;
@@ -63,6 +83,17 @@ bool run(const ck_tile::ArgParser& arg_parser)

    using Kernel = ck_tile::Reduce<Porblem>;

+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    if(!Kernel::IsSupportedArgument(
+           C, input_strides)) // output tensor's continuous dimension and input strides
+    {
+        throw std::runtime_error("Wrong! Arguments not supported!\n");
+    }
+
    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
                                       Kernel{},
@@ -71,10 +102,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                       0,
                                       static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
-                                       m,
-                                       n));
+                                       input_shape,
+                                       input_strides,
+                                       kept_dim,
+                                       reduce_dims));

-    std::size_t num_btype = sizeof(XDataType) * m * n + sizeof(YDataType) * m;
+    std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

@@ -86,7 +119,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
    {
        // reference
        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
-            x_host, y_host_ref, ReduceOp{});
+            x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
        y_buf.FromDevice(y_host_dev.mData.data());
        pass = ck_tile::check_err(y_host_dev, y_host_ref);

--- a/example/ck_tile/05_reduce/reduce.hpp
+++ b/example/ck_tile/05_reduce/reduce.hpp
@@ -1,164 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck_tile/core.hpp"
-#include "ck_tile/ops/common.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce.hpp"
-#include "ck_tile/ops/reduce/block/block_reduce2d_default_policy.hpp"
-
-namespace ck_tile {
-
-template <typename BlockWarps, // num warps along seq<M, N>
-          typename BlockTile,  // block size, seq<M, N>
-          typename WarpTile,   // warp size, seq<M, N>
-          typename Vector>     // contiguous pixels(vector size) along seq<M, N>
-struct Reduce2dShape
-{
-    static constexpr index_t Block_M = BlockTile::at(number<0>{});
-    static constexpr index_t Block_N = BlockTile::at(number<1>{});
-
-    static constexpr index_t Warp_M = WarpTile::at(number<0>{});
-    static constexpr index_t Warp_N = WarpTile::at(number<1>{});
-
-    static constexpr index_t Vector_M = Vector::at(number<0>{});
-    static constexpr index_t Vector_N = Vector::at(number<1>{});
-
-    static constexpr index_t WarpPerBlock_M = BlockWarps::at(number<0>{});
-    static constexpr index_t WarpPerBlock_N = BlockWarps::at(number<1>{});
-
-    static constexpr index_t ThreadPerWarp_M = Warp_M / Vector_M;
-    static constexpr index_t ThreadPerWarp_N = Warp_N / Vector_N;
-
-    static constexpr index_t Repeat_M = Block_M / (WarpPerBlock_M * Warp_M);
-    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);
-
-    static constexpr index_t BlockSize =
-        warpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
-};
-
-template <typename XDataType_,
-          typename ComputeDataType_,
-          typename YDataType_,
-          typename BlockShape_,
-          typename ReduceOp_>
-struct Reduce2dProblem
-{
-    using XDataType       = remove_cvref_t<XDataType_>;
-    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
-    using YDataType       = remove_cvref_t<YDataType_>;
-    using BlockShape      = remove_cvref_t<BlockShape_>;
-    using ReduceOp        = ReduceOp_;
-
-    static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
-    static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
-};
-
-template <typename Problem_, typename Policy_ = BlockReduce2dDefaultPolicy>
-struct Reduce
-{
-    using Problem = ck_tile::remove_cvref_t<Problem_>;
-    using Policy  = ck_tile::remove_cvref_t<Policy_>;
-
-    using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
-    using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
-    using YDataType       = ck_tile::remove_cvref_t<typename Problem::YDataType>;
-
-#if 0
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N)
-    const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        const auto f_reduce = [](const auto& v0, const auto& v1) { return v0 + v1; };
-
-        const XDataType reduce_init_value = 0;
-
-        constexpr auto reduce_dims = sequence<1>{};
-
-        auto y_compute = decltype(block_tile_reduce<ComputeDataType>(
-            load_tile(x_window), reduce_dims, f_reduce, reduce_init_value)){};
-
-        set_tile(y_compute, reduce_init_value);
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_tile_reduce(y_compute, x, reduce_dims, f_reduce);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_tile_reduce_sync(y_compute, f_reduce);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#else
-    CK_TILE_DEVICE void operator()(const XDataType* p_x, YDataType* p_y, index_t M, index_t N) const
-    {
-        using S = typename Problem::BlockShape;
-
-        const auto x_m_n = make_naive_tensor_view<address_space_enum::global>(
-            p_x, make_tuple(M, N), make_tuple(N, 1), number<S::Vector_N>{}, number<1>{});
-
-        const auto y_m = make_naive_tensor_view_packed<address_space_enum::global>(
-            p_y, make_tuple(M), number<1>{});
-
-        const auto iM = get_block_id() * S::Block_M;
-
-        auto x_window = make_tile_window(x_m_n,
-                                         make_tuple(number<S::Block_M>{}, number<S::Block_N>{}),
-                                         {iM, 0},
-                                         Policy::template MakeXBlockTileDistribution<Problem>());
-
-        auto y_window = make_tile_window(y_m, make_tuple(number<S::Block_M>{}), {iM});
-
-        __shared__ char smem[Policy::template GetSmemSize<Problem>()];
-
-        index_t num_n_tile_iteration =
-            __builtin_amdgcn_readfirstlane(integer_divide_ceil(N, S::Block_N));
-
-        auto reduce_func         = typename Problem::ReduceOp{};
-        auto block_reduce2d      = Policy::template GetBlockReduce2d<Problem>();
-        auto block_reduce2d_sync = Policy::template GetBlockReduce2dSync<Problem>();
-        auto block_reduce2d_cross_warp_sync =
-            Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
-
-        using XTensorType = decltype(load_tile(x_window));
-        auto y_compute    = block_reduce2d.template MakeYBlockTile<XTensorType>();
-        set_tile(y_compute, reduce_func.template GetIdentityValue<ComputeDataType>());
-
-        for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
-        {
-            const auto x = load_tile(x_window);
-            block_reduce2d(x, y_compute, reduce_func);
-            move_tile_window(x_window, {0, S::Block_N});
-        }
-
-        block_reduce2d_sync(y_compute, reduce_func);
-        block_reduce2d_cross_warp_sync(y_compute, smem, reduce_func);
-
-        store_tile(y_window, cast_tile<YDataType>(y_compute));
-    }
-#endif
-};
-
-} // namespace ck_tile
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -333,12 +333,12 @@ struct matrix_core_swizzle_kernel
                    return tmp_1;
 #else
                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv,
-                    constexpr index_t kv = Alignment;
-                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
-                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t kv          = Alignment;
+                    constexpr index_t nw          = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw          = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
                    constexpr index_t waveflatten = kw * nw * kv;
-                    const index_t kr = a_.k / (k1 * k2);
-                    const index_t nr = a_.n / nw;
+                    const index_t kr              = a_.k / (k1 * k2);
+                    const index_t nr              = a_.n / nw;
                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
                        p_dst,
                        make_tuple(nr, kr, waveflatten),
@@ -387,8 +387,8 @@ struct matrix_core_swizzle_kernel
                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
                    constexpr index_t waveflatten_tile = kw * nw * kv;
-                    constexpr index_t nr_tile = NPerBlock / nw;
-                    constexpr index_t kr_tile = KPerBlock / (kw * kv);
+                    constexpr index_t nr_tile          = NPerBlock / nw;
+                    constexpr index_t kr_tile          = KPerBlock / (kw * kv);
                    return make_tile_window(dst_view,
                                            make_tuple(number<nr_tile>{},
                                                       number<kr_tile>{},
--- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
+++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
@@ -25,7 +25,7 @@ add_custom_command(

 set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")

-message("adding ${TILE_RMSNORM2D_FWD}")
+message(DEBUG "adding ${TILE_RMSNORM2D_FWD}")
 add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
 target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS})
--- a/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/example_rmsnorm2d_fwd.cpp
@@ -15,13 +15,14 @@ auto create_args(int argc, char* argv[])
        .insert("v", "1", "cpu validation or not")
        .insert("prec", "fp16", "precision")
        .insert("warmup", "0", "cold iter")
-        .insert("repeat", "1", "hot iter");
+        .insert("repeat", "1", "hot iter")
+        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
 }

-template <typename DataType>
+template <typename DataType, int USEModelSensitive>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
    ck_tile::index_t m      = arg_parser.get_int("m");
@@ -81,8 +82,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                    false, // kSaveInvRms
                                    false, // kSaveUnquant
                                    kTwoPass,
-                                    ck_tile::Rmsnorm2dFusedAddEnum::NO_ADD,      // fuse add
-                                    ck_tile::Rmsnorm2dFusedQuantEnum::NO_SWEEP>; // fuse quant
+                                    ck_tile::Rmsnorm2dFusedAddEnum::NO_ADD,     // fuse add
+                                    ck_tile::Rmsnorm2dFusedQuantEnum::NO_SWEEP, // fuse quant
+                                    static_cast<ck_tile::Rmsnorm2dSensitiveEnum>(
+                                        USEModelSensitive)>;

    using Problem = ck_tile::Rmsnorm2dFwdPipelineProblem<XDataType,
                                                         GammaDataType,
@@ -97,7 +100,17 @@ bool run(const ck_tile::ArgParser& arg_parser)

    using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass<Problem>;
    using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass<Problem>;
-    using Pipeline        = std::conditional_t<kTwoPass, TwoPassPipeline, OnePassPipeline>;
+    using T5PassPipeline  = ck_tile::Rmsnorm2dFwdPipelineModelSensitiveT5Pass<Problem>;
+
+    using Pipeline =
+        std::conditional_t<(PipelineTraits::kUseModelSensitiveRMSNorm ==
+                                ck_tile::Rmsnorm2dSensitiveEnum::NO_SPECIFIC_MODEL ||
+                            PipelineTraits::kTwoPass), // TODO: consider TwoPass for T5PassPipeline
+                           std::conditional_t<PipelineTraits::kTwoPass,
+                                              TwoPassPipeline,
+                                              OnePassPipeline>, // kUseModelSensitiveRMSNorm
+                                                                // == 0
+                           T5PassPipeline>;

    using Default2DEpilogueProblem = ck_tile::
        Default2DEpilogueProblem<ComputeDataType, YDataType, false, PipelineTraits::kPadN, false>;
@@ -170,9 +183,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", stride:" << stride
-                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", stride:" << stride
+                  << ", s:" << USEModelSensitive << ", valid:" << (pass ? "y" : "n") << std::flush
+                  << std::endl;
    }

    return pass;
@@ -184,10 +197,19 @@ int main(int argc, char* argv[])
    if(!result)
        return -1;

-    const std::string data_type = arg_parser.get_str("prec");
+    const std::string data_type           = arg_parser.get_str("prec");
+    const int use_model_sensitive_rmsnorm = arg_parser.get_int("s");
+
    if(data_type == "fp16")
    {
-        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+        if(use_model_sensitive_rmsnorm == 0) // 0: for no specific RMSNorm
+        {
+            return run<ck_tile::half_t, 0>(arg_parser) ? 0 : -2;
+        }
+        else if(use_model_sensitive_rmsnorm == 1) // 1: for T5-like RMSNorm
+        {
+            return run<ck_tile::half_t, 1>(arg_parser) ? 0 : -2;
+        }
    }

    return -3;
--- a/example/ck_tile/10_rmsnorm2d/generate.py
+++ b/example/ck_tile/10_rmsnorm2d/generate.py
@@ -65,7 +65,8 @@ template <typename XDataType_,
          bool kSaveUnquant_,
          bool kTwoPass_,
          ck_tile::index_t kFusedAdd_ = 0,
-          ck_tile::index_t kFusedQuant_ = 0>
+          ck_tile::index_t kFusedQuant_ = 0,
+          ck_tile::index_t kUseModelSensitiveRMSNorm_ = 0>
 struct rmsnorm2d_fwd_traits_
 {
    using XDataType           = ck_tile::remove_cvref_t<XDataType_>;
@@ -74,22 +75,22 @@ struct rmsnorm2d_fwd_traits_
    using YScaleDataType      = ck_tile::remove_cvref_t<YScaleDataType_>;
    using UnquantYDataType    = ck_tile::remove_cvref_t<UnquantYDataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -97,13 +98,13 @@ struct rmsnorm2d_fwd_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

@@ -127,8 +128,9 @@ struct rmsnorm2d_fwd_traits_
    static constexpr bool kSaveInvRms  = kSaveInvRms_;
    static constexpr bool kSaveUnquant = kSaveUnquant_;
    static constexpr bool kTwoPass     = kTwoPass_;
-    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
-    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
+    static constexpr ck_tile::index_t kFusedAdd                 = kFusedAdd_;
+    static constexpr ck_tile::index_t kFusedQuant               = kFusedQuant_;
+    static constexpr ck_tile::index_t kUseModelSensitiveRMSNorm = kUseModelSensitiveRMSNorm_;
 };

 template <typename XDataType_,
@@ -146,7 +148,8 @@ template <typename XDataType_,
          bool kSaveUnquant_,
          bool kTwoPass_,
          int  kFusedAdd_,
-          int  kFusedQuant_>
+          int  kFusedQuant_,
+          int  kUseModelSensitiveRMSNorm_>
 using traits_ = rmsnorm2d_fwd_traits_<XDataType_,
                                      YDataType_,
                                      SmoothScaleDataType_,
@@ -162,7 +165,8 @@ using traits_ = rmsnorm2d_fwd_traits_<XDataType_,
                                      kSaveUnquant_,
                                      kTwoPass_,
                                      kFusedAdd_,
-                                      kFusedQuant_>;
+                                      kFusedQuant_,
+                                      kUseModelSensitiveRMSNorm_>;
 """

    API_COMMON_HEADER = """
@@ -197,7 +201,8 @@ float rmsnorm2d_fwd_(const S& s, A a)
                                    Traits_::kSaveUnquant,
                                    Traits_::kTwoPass,
                                    static_cast<ck_tile::Rmsnorm2dFusedAddEnum>(Traits_::kFusedAdd),
-                                    static_cast<ck_tile::Rmsnorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
+                                    static_cast<ck_tile::Rmsnorm2dFusedQuantEnum>(Traits_::kFusedQuant),
+                                    static_cast<ck_tile::Rmsnorm2dSensitiveEnum>(Traits_::kUseModelSensitiveRMSNorm)>;

    using PipelineProblem =
        ck_tile::Rmsnorm2dFwdPipelineProblem<typename RmsnormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XDataType,
@@ -213,7 +218,13 @@ float rmsnorm2d_fwd_(const S& s, A a)

    using OnePassPipeline = ck_tile::Rmsnorm2dFwdPipelineOnePass<PipelineProblem>;
    using TwoPassPipeline = ck_tile::Rmsnorm2dFwdPipelineTwoPass<PipelineProblem>;
-    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+    using T5PassPipeline  = ck_tile::Rmsnorm2dFwdPipelineModelSensitiveT5Pass<PipelineProblem>;
+
+    using Pipeline = std::conditional_t<
+        (Traits_::kUseModelSensitiveRMSNorm == 0 || Traits_::kTwoPass), // TODO: consider TwoPass for T5PassPipeline
+        std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>,  // kUseModelSensitiveRMSNorm == 0
+        T5PassPipeline
+    >;

    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, false>;
    using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
@@ -387,12 +398,13 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
        F_kTwoPass : bool
        F_kFusedAdd : int
        F_kFusedQuant : int
+        F_use_model_sensitive_rmsnorm : int

        @property
        def trait_name(self) ->str:
            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {DATA_TYPE_MAP[self.F_UnquantYDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveInvRms):5}, {BOOL_MAP(self.F_kSaveUnquant):5}'
-            t_ += f', {BOOL_MAP(self.F_kTwoPass):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
+            t_ += f', {BOOL_MAP(self.F_kTwoPass):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}, {self.F_use_model_sensitive_rmsnorm:4}'
            return t_

        # string when calling this kernel
@@ -413,6 +425,7 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
        F_add : int
        F_sweep : int
        F_saveunquant : bool
+        F_use_model_sensitive_rmsnorm : int
        instance_list : List[Any] # List[h_traits]

        @property
@@ -426,6 +439,10 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep]
            if self.F_saveunquant:
                nnn = nnn + '_saveunquant'
+            if self.F_use_model_sensitive_rmsnorm == 0:
+                nnn = nnn + '_nsm'
+            elif self.F_use_model_sensitive_rmsnorm == 1:
+                nnn = nnn + '_t5ml'
            return nnn

        @property
@@ -481,9 +498,9 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
                        elif ins.F_kFusedQuant == 2:
                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\" && t.save_unquant == {f_suq})'.format(
                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType, f_suq=BOOL_MAP(ins.F_kSaveUnquant))
-                        _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                        _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}) && (t.use_model_sensitive_rmsnorm == {f_use_model_sensitive_rmsnorm}) )'.format(
                                        f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
-                                        f_sweep_cond = _sweep_cond)
+                                        f_sweep_cond = _sweep_cond, f_use_model_sensitive_rmsnorm = ins.F_use_model_sensitive_rmsnorm)
                        inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
                                            F_VEC_COND = _cond, F_instance_func=ins.call_name)
                    #inner_str = inner_str + vec_str
@@ -516,85 +533,149 @@ float rmsnorm2d_fwd(rmsnorm2d_fwd_traits t,
        fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused (smooth) dynamic quant
        bool_list = [False, True]

-        #                                                              rm  rn  tm   tn  vn  pd     mv     unquant  2p     add    sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  8,  8,  8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 1,  True,  False, False,   False,   0,    0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 1,  True,  False, False,   False,   0,    0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 1,  True,  False, False,   False,   0,    0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  4,  64, 1,  True,  False, False,   False,   0,    0)],
-                        '640' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4, 128, 1,  True,  False, False,   False,   0,    0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  4,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  4,  64, 1,  True,  False, False,   False,   0,    0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  2,  64, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  2,  64, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  2,  64, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 1,  True,  False, False,   False,   0,    0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  2, 128, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 1,  True,  False, False,   False,   0,    0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1, 256, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1, 256, 1,  True,  False, False,   False,   0,    0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 128, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 1,  True,  False, False,   False,   0,    0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1,1024, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,   False,   0,    0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 512, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1,1024, 1,  True,  False, False,   False,   0,    0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 8,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 512, 4,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 2,  True,  False, False,   False,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1,1024, 1,  True,  False, False,   False,   0,    0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1,1024, 8,  True,  False, False,    True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,    True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  1, 256, 2,  True,  False, False,    True,   0,    0),
-                                  h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,    True,   0,    0)]}
+        h_trait_dicts = {
+            0: {
+                #                                              rm  rn  tm   tn  vn  pd     mv     unquant  2p     add    sweep  srm
+                '64'   :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  8,  8,  8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 1,  True,  False, False,   False,   0,    0,    0)],
+                '128'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 1,  True,  False, False,   False,   0,    0,    0)],
+                '256'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 1,  True,  False, False,   False,   0,    0,    0)],
+                '512'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  4,  64, 1,  True,  False, False,   False,   0,    0,    0)],
+                '640'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4, 128, 1,  True,  False, False,   False,   0,    0,    0)],
+                '768'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  4,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  4,  64, 1,  True,  False, False,   False,   0,    0,    0)],
+                '1024' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  2,  64, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  2,  64, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  2,  64, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 1,  True,  False, False,   False,   0,    0,    0)],
+                '1536' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  2, 128, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 1,  True,  False, False,   False,   0,    0,    0)],
+                '2048' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1, 256, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1, 256, 1,  True,  False, False,   False,   0,    0,    0)],
+                '3072' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 128, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 1,  True,  False, False,   False,   0,    0,    0)],
+                '4096' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1,1024, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,   False,   0,    0,    0)],
+                '6144' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 512, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1,1024, 1,  True,  False, False,   False,   0,    0,    0)],
+                '8192' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 8,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 512, 4,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 2,  True,  False, False,   False,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1,1024, 1,  True,  False, False,   False,   0,    0,    0)],
+                'big'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1,1024, 8,  True,  False, False,    True,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,    True,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  1, 256, 2,  True,  False, False,    True,   0,    0,    0),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,    True,   0,    0,    0)]
+            },
+            1: {
+                #                                               rm  rn  tm   tn  vn  pd     mv     unquant  2p     add    sweep srm
+                '64'   :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  8,  8,  8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 1,  True,  False, False,   False,   0,    0,    1)],
+                '128'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  16, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 1,  True,  False, False,   False,   0,    0,    1)],
+                '256'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  8,  32, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 1,  True,  False, False,   False,   0,    0,    1)],
+                '512'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  4,  64, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  4,  64, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  4,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  4,  64, 1,  True,  False, False,   False,   0,    0,    1)],
+                '640'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  2, 128, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  5,  4, 128, 1,  True,  False, False,   False,   0,    0,    1)],
+                '768'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  2, 128, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  4,  64, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  4,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  4,  64, 1,  True,  False, False,   False,   0,    0,    1)],
+                '1024' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  2, 128, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  2,  64, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  2,  64, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 1,  True,  False, False,   False,   0,    0,    1)],
+                '1536' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  2, 128, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 1,  True,  False, False,   False,   0,    0,    1)],
+                '2048' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1, 256, 1,  True,  False, False,   False,   0,    0,    1)],
+                '3072' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1, 256, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 1,  True,  False, False,   False,   0,    0,    1)],
+                '4096' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  2,  1,1024, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,   False,   0,    0,    1)],
+                '6144' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1, 512, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  3,  1,1024, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  6,  1,1024, 1,  True,  False, False,   False,   0,    0,    1)],
+                '8192' :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 8,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 512, 4,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 2,  True,  False, False,   False,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  8,  1,1024, 1,  True,  False, False,   False,   0,    0,    1)],
+                'big'  :[ h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  1,  1,1024, 8,  True,  False, False,    True,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1, 256, 4,  True,  False, False,    True,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1, 12,  1, 256, 2,  True,  False, False,    True,   0,    0,    1),
+                          h_traits('x', 'y', 'xs', 'ys', 'uqy', 1,  4,  1,1024, 1,  True,  False, False,    True,   0,    0,    1)]
+            } 
+        }
+        
        total_blob = list()
-        for hs_key in h_trait_dict:
-            hs = h_trait_dict[hs_key]
-            current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
-            for dtype, scale_type, fused_add, fused_quant, save_unquant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list, bool_list):
-                prec_i, prec_o = dtype.split(',')
-                scale_sm, scale_y = scale_type.split(',')
-                if prec_o in dynamic_quant_out_dtype and fused_quant != 1 and fused_quant != 2:
-                    continue # skip non dynamic quant case
-                if (fused_quant == 1 or fused_quant == 2) and hs_key == 'big':
-                    continue
-                if (fused_quant == 0 and save_unquant == True):
-                    continue # save_unquant should always be false when there is no quant enabled
-                current_hs = list()
-                for chs_ in hs:
-                    h_ = copy.copy(chs_) # copy the base instance out
-                    h_.F_XDataType = prec_i
-                    h_.F_YDataType = prec_o
-                    h_.F_SmoothScaleDataType = scale_sm
-                    h_.F_YScaleDataType = scale_y
-                    h_.F_UnquantYDataType = prec_i
-                    h_.F_kFusedAdd = fused_add
-                    h_.F_kFusedQuant = fused_quant
-                    h_.F_kSaveUnquant = save_unquant
-                    current_hs.append(h_) # + "\n"
-                #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
-                current_n_str = 'big' if hs_key == 'big' else current_n
-                total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, save_unquant, current_hs))
+
+        for model_sensitive_flag in [0, 1]: # 0: default; 1: model sensitive
+            current_trait_dict = h_trait_dicts[model_sensitive_flag]
+            for hs_key in current_trait_dict:
+                hs = current_trait_dict[hs_key]            
+                current_n = hs_key
+                for dtype, scale_type, fused_add, fused_quant, save_unquant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list, bool_list):
+                    prec_i, prec_o = dtype.split(',')
+                    scale_sm, scale_y = scale_type.split(',')
+                    if prec_o in dynamic_quant_out_dtype and fused_quant != 1 and fused_quant != 2:
+                        continue # skip non dynamic quant case
+                    if (fused_quant == 1 or fused_quant == 2) and hs_key == 'big':
+                        continue
+                    if (fused_quant == 0 and save_unquant == True):
+                        continue # save_unquant should always be false when there is no quant enabled
+                    current_hs = list()
+                    for chs_ in hs:
+                        h_ = copy.copy(chs_) # copy the base instance out
+                        h_.F_XDataType = prec_i
+                        h_.F_YDataType = prec_o
+                        h_.F_SmoothScaleDataType = scale_sm
+                        h_.F_YScaleDataType = scale_y
+                        h_.F_UnquantYDataType = prec_i
+                        h_.F_kFusedAdd = fused_add
+                        h_.F_kFusedQuant = fused_quant
+                        h_.F_kSaveUnquant = save_unquant
+                        current_hs.append(h_) # + "\n"
+                    #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
+                    current_n_str = 'big' if hs_key == 'big' else current_n
+                    total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, save_unquant, h_.F_use_model_sensitive_rmsnorm, current_hs))
        return total_blob

    def list_blobs(self) -> None:
@@ -712,4 +793,4 @@ if __name__ == "__main__":
    if args.list_blobs:
        list_blobs(args)
    else:
-        gen_blobs(args)
+        gen_blobs(args)
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -52,7 +52,8 @@ auto create_args(int argc, char* argv[])
        .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
        .insert("warmup", "5", "cold iter")
-        .insert("repeat", "20", "hot iter");
+        .insert("repeat", "20", "hot iter")
+        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -66,15 +67,16 @@ template <typename InDataType,
          bool SaveUnquant>
 bool run(const ck_tile::ArgParser& arg_parser)
 {
-    ck_tile::index_t m = arg_parser.get_int("m");
-    ck_tile::index_t n = arg_parser.get_int("n");
-    float epsilon      = arg_parser.get_float("e");
-    int kname          = arg_parser.get_int("kname");
-    int do_validation  = arg_parser.get_int("v");
-    int fused_add      = arg_parser.get_int("fadd");
-    int fused_quant    = arg_parser.get_int("fquant");
-    int warmup         = arg_parser.get_int("warmup");
-    int repeat         = arg_parser.get_int("repeat");
+    ck_tile::index_t m                    = arg_parser.get_int("m");
+    ck_tile::index_t n                    = arg_parser.get_int("n");
+    float epsilon                         = arg_parser.get_float("e");
+    int kname                             = arg_parser.get_int("kname");
+    int do_validation                     = arg_parser.get_int("v");
+    int fused_add                         = arg_parser.get_int("fadd");
+    int fused_quant                       = arg_parser.get_int("fquant");
+    int warmup                            = arg_parser.get_int("warmup");
+    int repeat                            = arg_parser.get_int("repeat");
+    const int use_model_sensitive_rmsnorm = arg_parser.get_int("s");

    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
    if(x_stride < 0)
@@ -191,13 +193,19 @@ bool run(const ck_tile::ArgParser& arg_parser)
        return base_str;
    }();

-    std::cout << "[" << prec_str << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+    std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
-              << ", yr_stride:" << yr_stride << std::flush;
+              << ", yr_stride:" << yr_stride << ", s:" << use_model_sensitive_rmsnorm << std::flush;

-    rmsnorm2d_fwd_traits traits{
-        prec_i, prec_o, prec_sm, prec_sy, SaveRms, SaveUnquant, fused_add, fused_quant};
+    rmsnorm2d_fwd_traits traits{prec_i,
+                                prec_o,
+                                prec_sm,
+                                prec_sy,
+                                SaveRms,
+                                SaveUnquant,
+                                fused_add,
+                                fused_quant,
+                                use_model_sensitive_rmsnorm};

    rmsnorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
                            fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.hpp
@@ -64,6 +64,8 @@ struct rmsnorm2d_fwd_traits
    bool save_unquant;
    int fused_add;   // 0:no-add, 1:pre-add-store, 2:pre-add
    int fused_quant; // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+
+    int use_model_sensitive_rmsnorm = 0; // 0: Use default RMSNorm; 1: Use T5-like implementation
 };

 float rmsnorm2d_fwd(rmsnorm2d_fwd_traits, rmsnorm2d_fwd_args, const ck_tile::stream_config&);
--- a/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
+++ b/example/ck_tile/10_rmsnorm2d/script/perf_test.sh
@@ -1,37 +1,74 @@
 #!/bin/sh
 EXE="$(find . -name tile_rmsnorm2d_fwd -type f | head -n 1)"

-$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
-$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=bf16 -repeat=1000
+# 0: for no specific RMSNorm
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=0

-$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec=fp16 -repeat=1000
-$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
-$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec=fp16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=0
+
+# 1: for T5-like RMSNorm
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000 -s=1
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000 -s=1
--- a/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
+++ b/example/ck_tile/10_rmsnorm2d/script/smoke_test.sh
@@ -5,29 +5,32 @@ for fquant in "" "-fquant=1 -prec_o=int8" "-fquant=2 -prec_o=int8" "-fquant=1 -p
  "-fquant=1 -prec_o=int8 -save_unquant=1" "-fquant=2 -prec_o=int8 -save_unquant=1" "-fquant=1 -prec_o=fp8 -save_unquant=1" "-fquant=2 -prec_o=fp8 -save_unquant=1"; do
 for pr_i in "fp16" "bf16" ; do
 for fadd in "0" "1"; do
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=99  -n=13
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=17  -n=16
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=100
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=4   -n=128
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=80  -n=127
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=22  -n=255 -stride=256
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=599
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=19  -n=512
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=33  -n=313 -stride=1000
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=11  -n=510
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=171 -n=676 -stride=818
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=91  -n=636
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=12  -n=768 -stride=800
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=100 -n=766 -stride=812
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=31  -n=1024
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=64  -n=1000 -stride=1004
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=8   -n=1501
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=1826
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=5   -n=2040
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
+# 0: for no specific RMSNorm; 1: for T-5 like RMSNorm
+for s in "0" "1"; do
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=99  -n=13
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=17  -n=16
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=1   -n=100
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=4   -n=128
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=80  -n=127
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=22  -n=255 -stride=256
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=7   -n=599
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=19  -n=512
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=33  -n=313 -stride=1000
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=11  -n=510
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=171 -n=676 -stride=818
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=91  -n=636
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=12  -n=768 -stride=800
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=100 -n=766 -stride=812
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=31  -n=1024
+# $EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=64  -n=1000 -stride=1004
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=8   -n=1501
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=3   -n=1826
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=5   -n=2040
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=7   -n=2734
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=1   -n=3182
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=9   -n=4096
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=3   -n=8192
+done
 done
 done
 done
@@ -36,8 +39,11 @@ done
 for fquant in ""
 for pr_i in "fp16" "bf16" ; do
 for fadd in "0" "1"; do
-$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
+# 0: for no specific RMSNorm; 1: for T-5 like RMSNorm
+for s in "0" "1"; do
+$EXE -prec_i=$pr_i -fadd=$fadd -s=$s $fquant -m=1   -n=10547
 #$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
 done
 done
 done
+done
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-message("adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
+message(DEBUG "adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
 file(GLOB INSTANCE_SRCS instances/*.cpp)
 add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp)
 target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
@@ -67,13 +67,14 @@ bool run(const ck_tile::ArgParser& arg_parser)

    using TypeConfig = AddRmsnormRdquantTypeConfig<InputDataType, QuantizedDataType>;

-    using ADataType       = typename TypeConfig::ADataType;
-    using BDataType       = typename TypeConfig::BDataType;
-    using GammaDataType   = typename TypeConfig::GammaDataType;
-    using XDataType       = typename TypeConfig::XDataType;
-    using YScaleDataType  = typename TypeConfig::YScaleDataType;
-    using QYDataType      = typename TypeConfig::QYDataType;
-    using ComputeDataType = float;
+    using ADataType        = typename TypeConfig::ADataType;
+    using BDataType        = typename TypeConfig::BDataType;
+    using GammaDataType    = typename TypeConfig::GammaDataType;
+    using XDataType        = typename TypeConfig::XDataType;
+    using YScaleDataType   = typename TypeConfig::YScaleDataType;
+    using QYDataType       = typename TypeConfig::QYDataType;
+    using ComputeDataType  = float;
+    using UnquantYDataType = ck_tile::null_type;

    // host verify
    ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
@@ -104,8 +105,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    b_buf.ToDevice(b_host.data());
    gamma_buf.ToDevice(gamma_host.data());

-    std::cout << "[" << input_data_type << ", " << quantized_data_type << "]"
-              << " m:" << m << ", n:" << n << ", stride:" << stride << std::flush;
+    std::cout << "[" << input_data_type << ", " << quantized_data_type << "]" << " m:" << m
+              << ", n:" << n << ", stride:" << stride << std::flush;

    add_rmsnorm2d_rdquant_fwd_traits traits{input_data_type, quantized_data_type, SaveX};

@@ -184,6 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        // Rmsnorm2d
        {
            ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+            ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n});

            // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
            // simplicity
@@ -191,8 +193,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                             GammaDataType,
                                             ComputeDataType,
                                             YDataType,
-                                             InvRmsDataType>(
-                x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
+                                             InvRmsDataType,
+                                             UnquantYDataType>(
+                x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon);
        }

        // yscale
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.hpp
@@ -80,22 +80,23 @@ struct add_rmsnorm2d_rdquant_fwd_traits_
    using InputDataType     = ck_tile::remove_cvref_t<InputDataType_>;
    using QuantizedDataType = ck_tile::remove_cvref_t<QuantizedDataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr auto WarpSize        = ck_tile::get_warp_size();
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
+            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
+            return total_warps * (WarpSize / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
+            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / WarpSize);
        }
    }();

@@ -103,13 +104,13 @@ struct add_rmsnorm2d_rdquant_fwd_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
+            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
+            return ThreadPerBlock_N_ / WarpSize;
        }
    }();

--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/example_add_rmsnorm2d_rdquant_fwd.cpp
@@ -186,7 +186,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
        // Rmsnorm2d
        {
            ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
-
+            ck_tile::HostTensor<ck_tile::null_type> unquant_y_host_ref({m, n});
            // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
            // simplicity
            ck_tile::reference_rmsnorm2d_fwd<XDataType,
@@ -194,7 +194,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                             ComputeDataType,
                                             YDataType,
                                             InvRmsDataType>(
-                x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
+                x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon);
        }

        // yscale
@@ -256,8 +256,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", stride:" << stride
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", stride:" << stride
                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }

--- a/example/ck_tile/12_smoothquant/CMakeLists.txt
+++ b/example/ck_tile/12_smoothquant/CMakeLists.txt
@@ -1,5 +1,5 @@
 function (add_smoothquant_example TARGET_NAME MAIN_SRC)
-    message("adding ${TARGET_NAME}")
+    message(DEBUG "adding ${TARGET_NAME}")
    # not using add_example_executable() to add target, since we don't want this to have
    # to be included in "make all/install/check"
    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
--- a/example/ck_tile/12_smoothquant/example_smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/example_smoothquant.cpp
@@ -216,10 +216,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
            }
        }

-        std::cout << "[" << data_type << "]"
-                  << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
-                  << ", y_stride:" << y_stride << ", valid:" << (pass ? "y" : "n") << std::flush
-                  << std::endl;
+        std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n
+                  << ", x_stride:" << x_stride << ", y_stride:" << y_stride
+                  << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }

    return pass;
--- a/example/ck_tile/12_smoothquant/smoothquant.cpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.cpp
@@ -93,9 +93,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
    x_buf.ToDevice(x_host.data());
    smscale_buf.ToDevice(smscale_host.data());

-    std::cout << "[" << data_type << "]"
-              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride << ", y_stride:" << y_stride
-              << std::flush;
+    std::cout << "[" << data_type << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", y_stride:" << y_stride << std::flush;

    smoothquant_traits traits{data_type};

--- a/example/ck_tile/12_smoothquant/smoothquant.hpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.hpp
@@ -49,22 +49,22 @@ struct smoothquant_traits_
 {
    using DataType = ck_tile::remove_cvref_t<DataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -72,13 +72,13 @@ struct smoothquant_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/13_moe_sorting/README.md
+++ b/example/ck_tile/13_moe_sorting/README.md
@@ -14,14 +14,24 @@ This will result in an executable `build/bin/tile_example_moe_sorting`
 ## example
 ```
 args:
-          -v    weather do CPU validation or not (default:1)
-       -pr_i    index data type. (currently only fp32 supported now) (default:int32)
-       -pr_w    output weight data type(currently only fp32 supported now) (default:fp32)
-          -t    number of input tokens (default:32)
-          -e    number of experts (default:8)
-          -k    topk (default:2)
-       -st_i    row stride of input, -1 means same as experts (default:-1)
-       -seed    seed to be used, -1 means random every time (default:-1)
-      -kname    when set to 1 it will print kernel name (default:0)
+           -v    turn CPU validation on (1) or off (0). (default:1)
+        -pr_i    index data type.  Only int32 is currently supported. (default:int32)
+        -pr_w    output weight data type. Only fp32 is currently supported. (default:fp32)
+           -t    number of input tokens. (default:128)
+                 If "local_t" presents, this value indicates global concurrency of all ranks.
+     -local_t    Number of local input tokens for curent rank. (default:-1)
+                 This value must be within range "[0, t)", or "-1"(no such feature)
+                 This feature is to simulate EP case where where each rank has different tokens.
+                 Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.
+           -e    number of num_experts (default:8)
+           -k    topk (default:4)
+        -unit    unit_size (default:32)
+-moe_buf_size    moe_buf_size (default:0)
+   -local_eid    a list of experts enabled as local expert. e.g. "0,1,4,5" (default:-1)
+                 please make sure eid is in ascending order!
+        -seed    seed to be used. When set to -1, a random seed will be generated each time invoking this example (default:-1)
+       -kname    prints the kernel name when set to 1 (default:0)
+      -warmup    number of iterations before benchmark the kernel (default:5)
+      -repeat    number of iterations to benchmark the kernel (default:20)

 ```
--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -18,20 +18,46 @@
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("v", "1", "weather do CPU validation or not")
-        .insert("pr_i", "int32", "index data type. (currently only int32 supported now)")
-        .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
-        .insert("t", "128", "number of input tokens")
+    arg_parser.insert("v", "1", "turn CPU validation on (1) or off (0).")
+        .insert("pr_i", "int32", "index data type.  Only int32 is currently supported.")
+        .insert("pr_w", "fp32", "output weight data type. Only fp32 is currently supported.")
+        .insert("t",
+                "128",
+                "number of input tokens.\n"
+                "If \"local_t\" presents, this value indicates global concurrency of all ranks.")
+        .insert(
+            "local_t",
+            "-1",
+            "Number of local input tokens for curent rank.\n"
+            "This value must be within range \"[0, t)\", or \"-1\"(no such feature)\n"
+            "This feature is to simulate EP case where where each rank has different tokens.\n"
+            "Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.")
        .insert("e", "8", "number of num_experts")
        .insert("k", "4", "topk")
        .insert("unit", "32", "unit_size")
+#if MOE_SORTING_FMOE_2D_BUF
+        .insert("moe_buf_interm_dim", "0", "interm_dim(col) of the following fmoe buf")
+        .insert(
+            "moe_buf_elem_bytes", "2", "fmoe buf element byte size, 1:8bit, 2:16bit, 4:32bit...")
+#else
        .insert("moe_buf_size", "0", "moe_buf_size")
+#endif
+        .insert("ci",
+                "1",
+                "clear workspace inside API or not(if \"0\", require manually clear outside)")
+        .insert(
+            "dispatch",
+            "0",
+            "dispatch policy. 0:automatically pick up kernel, 1:use single kernel, 2:use mp kernel")
        .insert("local_eid",
                "-1",
                "a list of experts enabled as local expert. e.g. \"0,1,4,5\"\n"
                "please make sure eid is in ascending order!")
-        .insert("seed", "-1", "seed to be used, -1 means random every time")
-        .insert("kname", "0", "when set to 1 it will print kernel name")
+        .insert("seed",
+                "-1",
+                "seed to be used. When set to -1, a random seed will be generated each time "
+                "invoking this example")
+        .insert("kname", "0", "prints the kernel name when set to 1")
        .insert("warmup", "5", "number of iterations before benchmark the kernel")
        .insert("repeat", "20", "number of iterations to benchmark the kernel");

@@ -70,14 +96,22 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    std::string index_prec  = args.get_str("pr_i");
    std::string weight_prec = args.get_str("pr_w");
    int tokens              = args.get_int("t");
+    int local_tokens        = args.get_int("local_t");
    int num_experts         = args.get_int("e");
    int topk                = args.get_int("k");
    int seed                = args.get_int("seed");
    int unit_size           = args.get_int("unit");
-    int moe_buf_size        = args.get_int("moe_buf_size");
-    int kname               = args.get_int("kname");
-    int warmup              = args.get_int("warmup");
-    int repeat              = args.get_int("repeat");
+#if MOE_SORTING_FMOE_2D_BUF
+    int moe_buf_interm_dim = args.get_int("moe_buf_interm_dim");
+    int moe_buf_elem_bytes = args.get_int("moe_buf_elem_bytes");
+#else
+    int64_t moe_buf_size = static_cast<int64_t>(args.get_uint64("moe_buf_size"));
+#endif
+    int kname           = args.get_int("kname");
+    int warmup          = args.get_int("warmup");
+    int repeat          = args.get_int("repeat");
+    bool clear_inside   = args.get_int("ci") != 0;
+    int dispatch_policy = args.get_int("dispatch");

    int max_output_ids =
        ck_tile::integer_least_multiple(topk * tokens + num_experts * unit_size - topk, unit_size);
@@ -95,6 +129,16 @@ bool test_moe_sorting(ck_tile::ArgParser args)
        return false;
    }

+    // if local_tokens == tokens, not local_token, but better avoid this since no meaning for such
+    // case
+    bool is_local_token = local_tokens >= 0 && local_tokens < tokens;
+
+    if(local_tokens > tokens)
+    {
+        printf("local_tokens:%d larger than tokens:%d, invalid\n", local_tokens, tokens);
+        return false;
+    }
+
    bool local_expert_masking      = args.get_str("local_eid") != "-1";
    auto local_expert_masking_host = [&]() {
        if(local_expert_masking)
@@ -125,11 +169,26 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    ck_tile::HostTensor<IndexType> sorted_ids_host({max_output_ids}, {1});
    ck_tile::HostTensor<WeightType> sorted_weights_host({max_output_ids}, {1});
    ck_tile::HostTensor<IndexType> sorted_expert_ids_host({max_output_ids / unit_size}, {1});
-    ck_tile::HostTensor<IndexType> sorted_id_cnt_host({1}, {1});
+    // for simplicity, below buffer allocate 2 dword
+    ck_tile::HostTensor<IndexType> sorted_id_cnt_host({2}, {1});
+#if MOE_SORTING_FMOE_2D_BUF
+    ck_tile::HostTensor<int8_t> moe_buf_host(
+        {static_cast<std::size_t>(is_local_token ? local_tokens : tokens) * moe_buf_interm_dim *
+         moe_buf_elem_bytes});
+    auto moe_buf_bytes = moe_buf_interm_dim == 0 ? static_cast<std::size_t>(0)
+                                                 : moe_buf_host.get_element_space_size_in_bytes();
+#else
    ck_tile::HostTensor<float> moe_buf_host({moe_buf_size});
+    auto moe_buf_bytes = moe_buf_size == 0 ? static_cast<std::size_t>(0)
+                                           : moe_buf_host.get_element_space_size_in_bytes();
+#endif

    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(weights_host);
+#if MOE_SORTING_FMOE_2D_BUF
+    ck_tile::FillUniformDistribution<int8_t>{-.5f, .5f}(moe_buf_host);
+#else
    ck_tile::FillUniformDistribution<WeightType>{-.5f, .5f}(moe_buf_host);
+#endif
    topid_unique_gen<IndexType>(topk_ids_host.mData, tokens, topk, num_experts, seed);

    ck_tile::DeviceMem topk_ids_dev(topk_ids_host.get_element_space_size_in_bytes());
@@ -143,9 +202,16 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    ck_tile::DeviceMem local_expert_masking_dev(
        local_expert_masking_host.get_element_space_size_in_bytes());

+    // used for simulating dynamic_tokens for EP case
+    ck_tile::DeviceMem local_tokens_dev(sizeof(ck_tile::index_t));
+    if(is_local_token)
+    {
+        local_tokens_dev.ToDevice(&local_tokens);
+    }
+
    topk_ids_dev.ToDevice(topk_ids_host.data());
    weights_dev.ToDevice(weights_host.data());
-    if(moe_buf_size > 0)
+    if(moe_buf_bytes > 0)
    {
        moe_buf_dev.ToDevice(moe_buf_host.data());
    }
@@ -153,29 +219,37 @@ bool test_moe_sorting(ck_tile::ArgParser args)
        local_expert_masking_dev.ToDevice(local_expert_masking_host.data());

    // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
-    ck_tile::index_t workspace_size = moe_sorting_get_workspace_size(tokens, num_experts);
+    ck_tile::index_t workspace_size =
+        moe_sorting_get_workspace_size(tokens, num_experts, topk, dispatch_policy);
    ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
-
-    if(workspace_size != 0)
+    if(workspace_size != 0 && clear_inside == false)
        moe_sorting_ws.SetZero(); // note, clear here!!!!

-    moe_sorting_trait trait{index_prec, weight_prec, local_expert_masking};
+    moe_sorting_trait trait{
+        index_prec, weight_prec, local_expert_masking, clear_inside, dispatch_policy};

    moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
                          weights_dev.GetDeviceBuffer(),
                          local_expert_masking ? local_expert_masking_dev.GetDeviceBuffer()
                                               : nullptr,
+                          is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
                          sorted_ids_dev.GetDeviceBuffer(),
                          sorted_weights_dev.GetDeviceBuffer(),
                          sorted_expert_ids_dev.GetDeviceBuffer(),
                          sorted_id_cnt_dev.GetDeviceBuffer(),
-                          moe_buf_size > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          moe_buf_bytes > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
                          workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
                          tokens,
                          unit_size,
                          num_experts,
                          topk,
-                          static_cast<ck_tile::index_t>(moe_buf_size * sizeof(float))};
+#if MOE_SORTING_FMOE_2D_BUF
+                          moe_buf_interm_dim,
+                          moe_buf_elem_bytes
+#else
+                          static_cast<ck_tile::long_index_t>(moe_buf_size * sizeof(float))
+#endif
+    };

    ck_tile::stream_config sc{nullptr,
                              true,
@@ -188,7 +262,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)

 #if 0
    {
-        ck_tile::HostTensor<char> ws_host({workspace_size}, {1});
+    ck_tile::HostTensor<char> ws_host({workspace_size}, {1});
        moe_sorting_ws.FromDevice(ws_host.data());

        int * p_mesh = reinterpret_cast<int*>(ws_host.data());
@@ -237,19 +311,36 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    }
 #endif

-    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, mp:%d, ",
+    printf("[%s|%s|%s|%d]tokens:%d",
           index_prec.c_str(),
           weight_prec.c_str(),
-           tokens,
-           num_experts,
-           topk,
-           workspace_size != 0 ? 1 : 0);
+           workspace_size == 0 ? "cx" : (clear_inside ? "ci" : "co"),
+           dispatch_policy,
+           tokens);
+    if(is_local_token)
+    {
+        printf("(%d)", local_tokens);
+    }
+    printf(", num_experts:%d, topk:%d, mp:%d, ", num_experts, topk, workspace_size != 0 ? 1 : 0);

    if(local_expert_masking)
    {
        printf("local_eid:%s, ", args.get_str("local_eid").c_str());
    }

+    if(moe_buf_bytes > 0)
+    {
+#if MOE_SORTING_FMOE_2D_BUF
+        printf("moe_buf:%lu(%d,%d), ",
+               static_cast<uint64_t>(moe_buf_bytes),
+               moe_buf_interm_dim,
+               moe_buf_elem_bytes);
+#else
+
+        printf("moe_buf:%lu, ", static_cast<uint64_t>(moe_buf_bytes));
+#endif
+    }
+
    if(ms < 0)
        printf("not supported\n");
    else
@@ -264,7 +355,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
    sorted_weights_dev.FromDevice(sorted_weights_host.data());
    sorted_expert_ids_dev.FromDevice(sorted_expert_ids_host.data());
    sorted_id_cnt_dev.FromDevice(sorted_id_cnt_host.data());
-    if(moe_buf_size > 0)
+    if(moe_buf_bytes > 0)
    {
        moe_buf_dev.FromDevice(moe_buf_host.data());
    }
@@ -286,6 +377,8 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                                                              ref_total_tokens_post_pad,
                                                              num_experts,
                                                              unit_size,
+                                                              is_local_token ? local_tokens
+                                                                             : tokens,
                                                              local_expert_masking);
        printf("total_tokens_post_pad:%d(%d), ",
               ref_total_tokens_post_pad,
@@ -308,6 +401,16 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                                      std::string("OUT Error: Incorrect eid!"),
                                      1e-6,
                                      1e-6);
+            // if(is_local_token)
+            {
+                auto t_ = is_local_token ? local_tokens : tokens;
+                bool _f = t_ == sorted_id_cnt_host.mData[1];
+                rtn &= _f;
+                if(!_f)
+                {
+                    printf("not equal token buffer pad %d(%d)\n", t_, sorted_id_cnt_host.mData[1]);
+                }
+            }
        }
        else
        {
@@ -315,9 +418,13 @@ bool test_moe_sorting(ck_tile::ArgParser args)
            rtn = false;
        }

-        if(moe_buf_size)
+        if(moe_buf_bytes)
        {
+#if MOE_SORTING_FMOE_2D_BUF
+            ck_tile::HostTensor<int8_t> moe_buf_ref({moe_buf_bytes});
+#else
            ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
+#endif
            rtn &= ck_tile::check_err(
                moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
        }
@@ -335,16 +442,26 @@ bool test_moe_sorting(ck_tile::ArgParser args)

 int main(int argc, char** argv)
 {
-    auto [result, args] = create_args(argc, argv);
-    if(!result)
-        return -1;
-    std::string index_prec  = args.get_str("pr_i");
-    std::string weight_prec = args.get_str("pr_w");
-
-    bool r = true;
-    if(weight_prec.compare("fp32") == 0 && index_prec.compare("int32") == 0)
+    try
    {
-        r &= test_moe_sorting<float, ck_tile::index_t>(args);
+        auto [result, args] = create_args(argc, argv);
+        if(!result)
+            return -1;
+
+        std::string index_prec  = args.get_str("pr_i");
+        std::string weight_prec = args.get_str("pr_w");
+
+        bool r = true;
+        if(weight_prec == "fp32" && index_prec == "int32")
+        {
+            r &= test_moe_sorting<float, ck_tile::index_t>(args);
+        }
+
+        return r ? 0 : -1;
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
    }
-    return r ? 0 : -1;
 }
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -7,6 +7,14 @@
 #define MOE_SORTING_USE_EX_KERNEL 1
 #endif

+#ifndef MOE_SORTING_SUPPORT_LARGE_EXPERT
+#define MOE_SORTING_SUPPORT_LARGE_EXPERT 0
+#endif
+
+#ifndef MOE_SORTING_SUPPORT_LARGE_TOPK
+#define MOE_SORTING_SUPPORT_LARGE_TOPK 0
+#endif
+
 #if !MOE_SORTING_USE_EX_KERNEL

 #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
@@ -25,15 +33,18 @@

 #else

-#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_, local_expert_masking_)                \
+#define MOE_SORTING_DISPATCH_(                                                                          \
+    sub_token_tile_, sub_token_onshot_, local_expert_masking_, local_token_)                            \
    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                        \
    constexpr bool sub_token_onshot           = sub_token_onshot_;                                      \
    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
+    constexpr bool local_token                = local_token_;                                           \
    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
-                                                    ms_weight_type,            \
-                                                    sub_token_tile,            \
-                                                    sub_token_onshot,          \
-                                                    local_expert_masking>;     \
+                                                                             ms_weight_type,            \
+                                                                             sub_token_tile,            \
+                                                                             sub_token_onshot,          \
+                                                                             local_expert_masking,      \
+                                                                             local_token>;              \
    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
    auto kargs                                = kernel::MakeKargs(a);                                   \
    const dim3 grids                          = kernel::GridSize(a);                                    \
@@ -43,32 +54,43 @@
        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \
    return ave_time;

-#define MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_) \
-    if(row_ % 8 == 0)                                                                   \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else if(row_ % 4 == 0)                                                              \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else if(row_ % 2 == 0)                                                              \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else                                                                                \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_);             \
+#define MOE_SORTING_DISPATCH_SUB_TOKEN_(                                                  \
+    row_, sub_token_onshot_, local_expert_masking_, local_token_)                         \
+    if(row_ % 8 == 0)                                                                     \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else if(row_ % 4 == 0)                                                                \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else if(row_ % 2 == 0)                                                                \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else                                                                                  \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_, local_token_); \
    }

-#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)            \
-    if(is_sub_token_onshot)                                                 \
-    {                                                                       \
-        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, true, local_expert_masking_)  \
-    }                                                                       \
-    else                                                                    \
-    {                                                                       \
-        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, false, local_expert_masking_) \
+#define MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, sub_token_onshot_, local_expert_masking_)    \
+    if(is_local_token)                                                                         \
+    {                                                                                          \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_, true)  \
+    }                                                                                          \
+    else                                                                                       \
+    {                                                                                          \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_, false) \
+    }
+
+#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)                \
+    if(is_sub_token_onshot)                                                     \
+    {                                                                           \
+        MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, true, local_expert_masking_)  \
+    }                                                                           \
+    else                                                                        \
+    {                                                                           \
+        MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, false, local_expert_masking_) \
    }

 #define MOE_SORTING_DISPATCH_EMASK_(row_)        \
@@ -153,7 +175,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        }
        }
 #else
-        if(moe_sorting_get_workspace_size(a.tokens, a.num_experts) != 0)
+        if(moe_sorting_get_workspace_size(a.tokens, a.num_experts, a.topk, t.dispatch_policy) != 0)
        {
            return moe_sorting_mp(t, a, s);
        }
@@ -163,6 +185,7 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
        auto row_                    = sub_token_ / 8;
        bool is_sub_token_onshot     = a.tokens <= sub_token_;
        bool is_local_expert_masking = t.local_expert_masking;
+        bool is_local_token          = a.p_local_tokens != nullptr;

        MOE_SORTING_DISPATCH_EMASK_(row_);
        // MOE_SORTING_DISPATCH_ETILE(0, 0);
@@ -171,88 +194,251 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
    return -1;
 }

-#define MOE_SORTING_MP_0(unroll_num_, expert_masking_)                                            \
-    [&]() {                                                                                       \
-        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
-        constexpr bool expert_masking         = expert_masking_;                                  \
-        using ms_problem =                                                                        \
-            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
-        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>;                   \
-        auto kargs        = kernel::MakeKargs(a);                                                 \
-        const dim3 grids  = kernel::GridSize(a);                                                  \
-        const dim3 blocks = kernel::BlockSize(a);                                                 \
-        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+#define MOE_SORTING_MP_0(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
    }()

-#define MOE_SORTING_MP_1(unroll_num_, expert_masking_)                                            \
-    [&]() {                                                                                       \
-        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
-        constexpr bool expert_masking         = expert_masking_;                                  \
-        using ms_problem =                                                                        \
-            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
-        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>;                   \
-        auto kargs        = kernel::MakeKargs(a);                                                 \
-        const dim3 grids  = kernel::GridSize(a);                                                  \
-        const dim3 blocks = kernel::BlockSize(a);                                                 \
-        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+#define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+    }()
+#if MOE_SORTING_SUPPORT_LARGE_EXPERT
+#define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
    }()

-#define MOE_SORTING_MP_2(unroll_num_, expert_masking_)                                            \
-    [&]() {                                                                                       \
-        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
-        constexpr bool expert_masking         = expert_masking_;                                  \
-        using ms_problem =                                                                        \
-            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
-        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>;                   \
-        auto kargs        = kernel::MakeKargs(a);                                                 \
-        const dim3 grids  = kernel::GridSize(a);                                                  \
-        const dim3 blocks = kernel::BlockSize(a);                                                 \
-        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+#define MOE_SORTING_MP_3(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
+    }()
+#endif
+
+#define MOE_SORTING_MP_23(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                          \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                         \
+        constexpr bool expert_masking         = expert_masking_;                                     \
+        constexpr bool local_token            = local_token_;                                        \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,             \
+                                                                             ms_weight_type,         \
+                                                                             mesh_type_,             \
+                                                                             unroll_num,             \
+                                                                             expert_masking,         \
+                                                                             local_token>;           \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P23<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                                \
+        const dim3 grids                      = kernel::GridSize(a);                                 \
+        const dim3 blocks                     = kernel::BlockSize(a);                                \
+        const auto lds_size                   = kernel::GetSmemSize(a);                              \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, lds_size, kargs);   \
    }()

-#define MOE_SORTING_MP_3(unroll_num_, expert_masking_)                                            \
-    [&]() {                                                                                       \
-        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
-        constexpr bool expert_masking         = expert_masking_;                                  \
-        using ms_problem =                                                                        \
-            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
-        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>;                   \
-        auto kargs        = kernel::MakeKargs(a);                                                 \
-        const dim3 grids  = kernel::GridSize(a);                                                  \
-        const dim3 blocks = kernel::BlockSize(a);                                                 \
-        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+#define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_)            \
+    if(t.local_expert_masking)                                                                     \
+    {                                                                                              \
+        if(is_local_token)                                                                         \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       maybe_clear_workspace,                                      \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true, true),     \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, true, true),     \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true, true));  \
+            return ave_time;                                                                       \
+        }                                                                                          \
+        else                                                                                       \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       maybe_clear_workspace,                                      \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true, false),    \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, true, false),    \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true, false)); \
+            return ave_time;                                                                       \
+        }                                                                                          \
+    }                                                                                              \
+    else                                                                                           \
+    {                                                                                              \
+        if(is_local_token)                                                                         \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       maybe_clear_workspace,                                      \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false, true),    \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false, true),    \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false, true)); \
+            return ave_time;                                                                       \
+        }                                                                                          \
+        else                                                                                       \
+        {                                                                                          \
+            float ave_time = ck_tile::launch_kernel(                                               \
+                s,                                                                                 \
+                maybe_clear_workspace,                                                             \
+                MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false, false),                          \
+                MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false, false),                          \
+                MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false, false));                       \
+            return ave_time;                                                                       \
+        }                                                                                          \
+    }
+
+#define MOR_SORTING_CLEAR_WS_DISPATCH_(is_local_token_, block_size_, occu_)                 \
+    [&]() {                                                                                 \
+        using problem_ =                                                                    \
+            ck_tile::MoeSortingClearWorkspaceProblem<is_local_token_, block_size_, occu_>;  \
+        using kernel      = ck_tile::MoeSortingClearWorkspaceKernel<problem_>;              \
+        auto kargs        = kernel::MakeKargs(a);                                           \
+        const dim3 grids  = kernel::GridSize(a);                                            \
+        const dim3 blocks = kernel::BlockSize(a);                                           \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs); \
    }()

 float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
 {
+    bool is_local_token = a.p_local_tokens != nullptr;
    if(t.weight_type == "fp32" && t.index_type == "int32")
    {
        using ms_index_t     = ck_tile::index_t;
        using ms_weight_type = float;

-        if(t.local_expert_masking)
+        auto maybe_clear_workspace = [=](const ck_tile::stream_config& s_) {
+            if(t.clear_workspace_inside_api)
+            {
+                if(is_local_token)
+                {
+                    auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(true, 1024, 1);
+                    k(s_);
+                }
+                else
+                {
+                    auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(false, 1024, 1);
+                    k(s_);
+                }
+            }
+        };
+
+        if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) >
+           ck_tile::get_smem_capacity())
        {
-            float ave_time = ck_tile::launch_kernel(s,
-                                                    MOE_SORTING_MP_0(1, true),
-                                                    MOE_SORTING_MP_1(1, true),
-                                                    MOE_SORTING_MP_2(1, true),
-                                                    MOE_SORTING_MP_3(1, true));
-            return ave_time;
+#if MOE_SORTING_SUPPORT_LARGE_EXPERT
+            if(t.local_expert_masking)
+            {
+                float ave_time = ck_tile::launch_kernel(s,
+                                                        maybe_clear_workspace,
+                                                        MOE_SORTING_MP_0(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_1(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_2(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_3(ms_index_t, 1, true));
+                return ave_time;
+            }
+            else
+            {
+                float ave_time = ck_tile::launch_kernel(s,
+                                                        maybe_clear_workspace,
+                                                        MOE_SORTING_MP_0(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_1(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_2(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_3(ms_index_t, 1, false));
+                return ave_time;
+            }
+#else
+            printf("do not support large expert %d\n", a.num_experts);
+            return -1;
+#endif
        }
        else
        {
-            float ave_time = ck_tile::launch_kernel(s,
-                                                    MOE_SORTING_MP_0(1, false),
-                                                    MOE_SORTING_MP_1(1, false),
-                                                    MOE_SORTING_MP_2(1, false),
-                                                    MOE_SORTING_MP_3(1, false));
-            return ave_time;
+            ck_tile::index_t mesh_byte_size =
+                ck_tile::impl::moe_sorting_mesh_byte_size(a.tokens, a.num_experts, a.topk);
+            if(mesh_byte_size == 1)
+            {
+                if(a.tokens * a.topk % 4 == 0)
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint8_t, 4, 16, 16)
+                }
+                else
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint8_t, 1, 16, 16)
+                }
+            }
+            else if(mesh_byte_size == 2)
+            {
+#if MOE_SORTING_SUPPORT_LARGE_TOPK
+                if(a.tokens * a.topk % 4 == 0)
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint16_t, 4, 8, 8)
+                }
+                else
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint16_t, 1, 8, 8)
+                }
+#else
+                printf("do not support large topk %d\n", a.topk);
+                return -1;
+#endif
+            }
+            else
+            {
+                MOR_SORTING_MP_DISPATCH_(ck_tile::index_t, 1, 1, 1)
+            }
        }
    }
    return -1;
 }

-int moe_sorting_get_workspace_size(int tokens, int num_experts)
+int moe_sorting_get_workspace_size(int tokens, int num_experts, int topk, int dispatch_policy)
 {
-    return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts);
+    return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts, topk, dispatch_policy);
 }
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -10,8 +10,14 @@
 struct moe_sorting_trait
 {
    std::string index_type;
-    std::string weight_type;   // currently always float
-    bool local_expert_masking; // if mask experts as local expert
+    std::string weight_type;         // currently always float
+    bool local_expert_masking;       // if mask experts as local expert
+    bool clear_workspace_inside_api; // if true, no need clear workspace outsize (will take care of
+                                     // it inside API)
+    int dispatch_policy; // 0 - let the API choose kernel for you. 1 - always use single kerenl. 2 -
+                         // always use mp kernel NOTE: moe_sorting_get_workspace_size() need use
+                         // same dispatch_policy value. it will be undefined behavior if ppl using
+                         // different value when get ws and call the kernel
 };

 struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
@@ -22,6 +28,6 @@ struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
 // if return non zero, means need workspace, you need to allocate a GPU buffer
 // and set to moe_sorting_args.p_ws
 // NOTE: workspace size are required to clear zero before use the API
-int moe_sorting_get_workspace_size(int tokens, int num_experts);
+int moe_sorting_get_workspace_size(int tokens, int num_experts, int topk, int dispatch_policy);
 float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
 float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
--- a/example/ck_tile/13_moe_sorting/script/smoke_test.sh
+++ b/example/ck_tile/13_moe_sorting/script/smoke_test.sh
@@ -1,7 +1,9 @@
 # #!/bin/sh

 EXE=./build/bin/tile_example_moe_sorting
+MOE_BUF="12"

+if [ "x$MOE_BUF" = "x1" ] ; then
 $EXE -t=80 -e=17 -moe_buf_size=16
 $EXE -t=111 -e=117 -moe_buf_size=4
 $EXE -t=1000 -e=55 -moe_buf_size=1024
@@ -26,3 +28,62 @@ $EXE -t=13 -e=64 -k=3 -local_eid=4,5,6,7,8,9,10,11
 $EXE -t=99 -e=33 -k=9 -local_eid=6,10,11,15,19
 $EXE -t=80 -e=99 -k=10 -local_eid=0,8,12,33
 $EXE -t=11 -e=256 -k=5 -local_eid=99,110,129
+$EXE -t=128 -e=128 -k=6 -moe_buf_size=163840
+$EXE -t=8192 -e=32 -k=5 -moe_buf_size=163840
+$EXE -t=8192 -e=32 -k=8 -moe_buf_size=163840
+$EXE -t=8192 -e=256 -k=5 -moe_buf_size=163840
+$EXE -t=8192 -e=256 -k=8 -moe_buf_size=163840
+$EXE -t=163840 -e=256 -k=8 -moe_buf_size=163840
+$EXE -t=12 -local_t=3 -e=256 -k=5 -local_eid=9,10,199,145
+$EXE -t=67 -local_t=9 -e=555 -k=5 -local_eid=19,23,24,25,26,99
+$EXE -t=99 -local_t=93 -e=121 -moe_buf_size=10244
+$EXE -t=536 -local_t=345 -e=802 -k=99
+$EXE -t=331 -local_t=39 -e=83 -k=33
+$EXE -t=765 -local_t=654 -e=783 -k=8
+$EXE -t=23 -local_t=9 -e=1 -k=1
+$EXE -t=7 -local_t=0 -e=89 -k=1 -local_eid=0,8,12,33
+$EXE -t=61 -local_t=0 -e=333 -k=99 -local_eid=0,8,12,33
+$EXE -t=133940 -local_t=111921 -e=256 -k=17 -moe_buf_size=133940
+else
+$EXE -t=80 -e=17 -moe_buf_interm_dim=16 -moe_buf_elem_bytes=4
+$EXE -t=111 -e=117 -moe_buf_interm_dim=4 -moe_buf_elem_bytes=4
+$EXE -t=1000 -e=55 -moe_buf_interm_dim=1024 -moe_buf_elem_bytes=1
+$EXE -t=99 -e=120  -moe_buf_interm_dim=10244 -moe_buf_elem_bytes=2
+$EXE -t=175 -e=64 -k=8
+$EXE -t=65 -e=8 -k=2
+$EXE -t=1 -e=25
+$EXE -t=31 -e=19 -k=15
+$EXE -t=81 -e=37 -k=7
+$EXE -t=23 -e=1 -k=1
+$EXE -t=127 -e=99 -k=19
+$EXE -t=71 -e=11 -k=11
+$EXE -t=1 -e=1 -k=1
+$EXE -t=99 -e=2 -k=1
+$EXE -t=333 -e=99 -k=13
+$EXE -t=11 -e=256 -k=5
+$EXE -t=64 -e=455 -k=8
+$EXE -t=777 -e=802 -k=99
+$EXE -t=4097 -e=906 -k=51
+$EXE -t=128 -e=32 -k=5 -local_t=6 -moe_buf_interm_dim=262144
+$EXE -t=13 -e=64 -k=3 -local_eid=4,5,6,7,8,9,10,11
+$EXE -t=99 -e=33 -k=9 -local_eid=6,10,11,15,19
+$EXE -t=80 -e=99 -k=10 -local_eid=0,8,12,33
+$EXE -t=11 -e=256 -k=5 -local_eid=99,110,129
+$EXE -t=128 -e=128 -k=6 -moe_buf_interm_dim=163840 -moe_buf_elem_bytes=1
+$EXE -t=8192 -e=32 -k=5 -local_t=11 -moe_buf_interm_dim=163840
+$EXE -t=8192 -e=32 -k=8 -local_t=12 -moe_buf_interm_dim=163840 -moe_buf_elem_bytes=1
+$EXE -t=8192 -e=256 -k=5 -local_t=13 -moe_buf_interm_dim=163840
+$EXE -t=8192 -e=256 -k=8 -local_t=8 -moe_buf_interm_dim=163840
+$EXE -t=163840 -e=256 -k=8 -local_t=4 -moe_buf_interm_dim=163840 -moe_buf_elem_bytes=4
+$EXE -t=12 -local_t=3 -e=256 -k=5 -local_eid=9,10,199,145
+$EXE -t=67 -local_t=9 -e=555 -k=5 -local_eid=19,23,24,25,26,99
+$EXE -t=99 -local_t=93 -e=121 -local_t=4 -moe_buf_interm_dim=10244
+$EXE -t=536 -local_t=345 -e=802 -k=99
+$EXE -t=331 -local_t=39 -e=83 -k=33
+$EXE -t=765 -local_t=654 -e=783 -k=8
+$EXE -t=23 -local_t=9 -e=1 -k=1
+$EXE -t=7 -local_t=0 -e=89 -k=1 -local_eid=0,8,12,33
+$EXE -t=61 -local_t=0 -e=333 -k=99 -local_eid=0,8,12,33
+$EXE -t=133940 -local_t=111921 -e=256 -k=17 -local_t=2 -moe_buf_interm_dim=133940 -moe_buf_elem_bytes=1
+
+fi
--- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
+++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
@@ -1,5 +1,5 @@
 function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
-    message("adding ${TARGET_NAME}")
+    message(DEBUG "adding ${TARGET_NAME}")
    # not using add_example_executable() to add target, since we don't want this to have
    # to be included in "make all/install/check"
    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.cpp
@@ -124,9 +124,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
    smscale_buf.ToDevice(smscale_host.data());
    topk_ids_buf.ToDevice(topk_ids_host.data());

-    std::cout << "[" << prec_i << "-" << prec_o << "]"
-              << " tokens:" << tokens << ", hidden_size:" << hidden_size << ", stride:" << stride
-              << ", experts:" << experts << ", topk:" << topk << std::flush;
+    std::cout << "[" << prec_i << "-" << prec_o << "]" << " tokens:" << tokens
+              << ", hidden_size:" << hidden_size << ", stride:" << stride << ", experts:" << experts
+              << ", topk:" << topk << std::flush;

    moe_smoothquant_traits traits{prec_i, prec_o};

--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
@@ -38,22 +38,22 @@ struct moe_smoothquant_traits_
    using InputType  = ck_tile::remove_cvref_t<InputType_>;
    using OutputType = ck_tile::remove_cvref_t<OutputType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= warpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % warpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / warpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (warpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(warpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / warpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -61,13 +61,13 @@ struct moe_smoothquant_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(warpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % warpSize == 0);
-            return ThreadPerBlock_N_ / warpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/15_fused_moe/CMakeLists.txt
+++ b/example/ck_tile/15_fused_moe/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe")
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-message("adding ${TILE_EXAPMLE_FUSED_MOE}")
+message(DEBUG "adding ${TILE_EXAPMLE_FUSED_MOE}")
 file(GLOB INSTANCE_SRCS instances/*.cpp)
 add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp)
 target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
--- a/example/ck_tile/15_fused_moe/fused_moe.hpp
+++ b/example/ck_tile/15_fused_moe/fused_moe.hpp
@@ -16,6 +16,7 @@ struct fused_moe_args
    const void* d_scale_ptr;           // [e, 1, k], down scale
    const void* y_smooth_scale_ptr;    // [e, 1, n], smooth-quant-scale for 2nd gemm input
    const void* local_expert_mask_ptr; // [e], local_expert_mask_ptr for EP
+    const void* local_tokens;          // [1] if not nullptr, tokens read from here
    void* o_ptr;                       // [m, k], output token (no need to do zeroing)
    void* ws_ptr;                      // size is moe_sorting_get_workspace_size()
                                       // if return zero, then could be nullptr
@@ -56,4 +57,6 @@ struct fused_moe_traits
    bool local_expert_masking; // if mask experts as local expert
 };

+// if return zero, no ws needed
+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk);
 float fused_moe(fused_moe_traits, fused_moe_args, const ck_tile::stream_config&);
--- a/example/ck_tile/15_fused_moe/fused_moesorting.hpp
+++ b/example/ck_tile/15_fused_moe/fused_moesorting.hpp
@@ -18,4 +18,5 @@ struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs
 {
 };

+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk);
 float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s);
--- a/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
@@ -2,6 +2,13 @@
 // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

 #include "fused_moe.hpp"
+#include "ck_tile/ops/fused_moe.hpp"
+
+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk)
+{
+    return ck_tile::moe_sorting_get_workspace_size(
+        tokens, num_experts, topk, 0 /*dispatch policy*/);
+}

 float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s)
 {
@@ -19,20 +26,27 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf

    auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking};
    auto a0 = fused_moesorting_args{
-        a.topk_ids_ptr,                              // const void* p_topk_ids;
-        a.topk_weight_ptr,                           // const void* p_weights;
-        a.local_expert_mask_ptr,                     // const void* p_local_expert_mask;
-        a.sorted_token_ids_ptr,                      // void* p_sorted_token_ids;
-        a.sorted_weight_ptr,                         // void* p_sorted_weights;
-        a.sorted_expert_ids_ptr,                     // void* p_sorted_expert_ids;
-        a.num_sorted_tiles_ptr,                      // void* p_total_tokens_post_pad;
-        a.o_ptr,                                     // void* p_moe_buf;
-        a.ws_ptr,                                    // void* p_ws;
-        a.num_tokens,                                // index_t tokens;
-        a.block_m,                                   // index_t unit_size;
-        a.num_experts,                               // index_t num_experts;
-        a.topk,                                      // index_t topk;
-        a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes;
+        a.topk_ids_ptr,          // const void* p_topk_ids;
+        a.topk_weight_ptr,       // const void* p_weights;
+        a.local_expert_mask_ptr, // const void* p_local_expert_mask;
+        a.local_tokens,
+        a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
+        a.sorted_weight_ptr,     // void* p_sorted_weights;
+        a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
+        a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
+        a.o_ptr,                 // void* p_moe_buf;
+        a.ws_ptr,                // void* p_ws;
+        a.num_tokens,            // index_t tokens;
+        a.block_m,               // index_t unit_size;
+        a.num_experts,           // index_t num_experts;
+        a.topk,                  // index_t topk;
+#if MOE_SORTING_FMOE_2D_BUF
+        a.stride_token,
+        o_data_bytes,
+#else
+        static_cast<ck_tile::long_index_t>(a.num_tokens) * a.stride_token *
+            o_data_bytes // index_t moe_buf_bytes;
+#endif
    };

    auto t1 = fused_moegemm_traits{t.prec_i,
--- a/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moegemm_api_internal.hpp
@@ -16,11 +16,11 @@ float fused_moegemm_(const ck_tile::stream_config& s, fused_moegemm_args a)
 {
    using f_traits = ck_tile::FusedMoeGemmTraits<Ts_::GateOnly, Ts_::FusedQuant == 1, 1 /*atomic*/>;
    using f_shape  = ck_tile::FusedMoeGemmShape<typename Ts_::BlockTile_0,
-                                               typename Ts_::WarpPerBlock_0,
-                                               typename Ts_::WarpTile_0,
-                                               typename Ts_::BlockTile_1,
-                                               typename Ts_::WarpPerBlock_0,
-                                               typename Ts_::WarpTile_0>;
+                                                typename Ts_::WarpPerBlock_0,
+                                                typename Ts_::WarpTile_0,
+                                                typename Ts_::BlockTile_1,
+                                                typename Ts_::WarpPerBlock_0,
+                                                typename Ts_::WarpTile_0>;

    constexpr auto get_activation_ = []() {
        if constexpr(Ts_::Activation == 0)
--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -7,6 +7,14 @@
 #define MOE_SORTING_USE_EX_KERNEL 1
 #endif

+#ifndef MOE_SORTING_SUPPORT_LARGE_EXPERT
+#define MOE_SORTING_SUPPORT_LARGE_EXPERT 0
+#endif
+
+#ifndef MOE_SORTING_SUPPORT_LARGE_TOPK
+#define MOE_SORTING_SUPPORT_LARGE_TOPK 0
+#endif
+
 #if !MOE_SORTING_USE_EX_KERNEL

 #define MOE_SORTING_DISPATCH_ETILE(unroll_num_, expert_tile_)                         \
@@ -25,15 +33,18 @@

 #else

-#define MOE_SORTING_DISPATCH_(sub_token_tile_, sub_token_onshot_, local_expert_masking_)                \
+#define MOE_SORTING_DISPATCH_(                                                                          \
+    sub_token_tile_, sub_token_onshot_, local_expert_masking_, local_token_)                            \
    constexpr ck_tile::index_t sub_token_tile = sub_token_tile_;                                        \
    constexpr bool sub_token_onshot           = sub_token_onshot_;                                      \
    constexpr bool local_expert_masking       = local_expert_masking_;                                  \
+    constexpr bool local_token                = local_token_;                                           \
    using ms_problem                          = ck_tile::MoeSortingProblemEx<index_t,                   \
-                                                    ms_weight_type,            \
-                                                    sub_token_tile,            \
-                                                    sub_token_onshot,          \
-                                                    local_expert_masking>;     \
+                                                                             ms_weight_type,            \
+                                                                             sub_token_tile,            \
+                                                                             sub_token_onshot,          \
+                                                                             local_expert_masking,      \
+                                                                             local_token>;              \
    using kernel                              = ck_tile::MoeSortingKernel<ms_problem>;                  \
    auto kargs                                = kernel::MakeKargs(a);                                   \
    const dim3 grids                          = kernel::GridSize(a);                                    \
@@ -43,32 +54,43 @@
        s, ck_tile::make_kernel(kernel{}, grids, blocks, lds_bytes, kargs)); \
    return ave_time;

-#define MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_) \
-    if(row_ % 8 == 0)                                                                   \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else if(row_ % 4 == 0)                                                              \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else if(row_ % 2 == 0)                                                              \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_);             \
-    }                                                                                   \
-    else                                                                                \
-    {                                                                                   \
-        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_);             \
+#define MOE_SORTING_DISPATCH_SUB_TOKEN_(                                                  \
+    row_, sub_token_onshot_, local_expert_masking_, local_token_)                         \
+    if(row_ % 8 == 0)                                                                     \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(8, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else if(row_ % 4 == 0)                                                                \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(4, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else if(row_ % 2 == 0)                                                                \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(2, sub_token_onshot_, local_expert_masking_, local_token_); \
+    }                                                                                     \
+    else                                                                                  \
+    {                                                                                     \
+        MOE_SORTING_DISPATCH_(1, sub_token_onshot_, local_expert_masking_, local_token_); \
    }

-#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)            \
-    if(is_sub_token_onshot)                                                 \
-    {                                                                       \
-        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, true, local_expert_masking_)  \
-    }                                                                       \
-    else                                                                    \
-    {                                                                       \
-        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, false, local_expert_masking_) \
+#define MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, sub_token_onshot_, local_expert_masking_)    \
+    if(is_local_token)                                                                         \
+    {                                                                                          \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_, true)  \
+    }                                                                                          \
+    else                                                                                       \
+    {                                                                                          \
+        MOE_SORTING_DISPATCH_SUB_TOKEN_(row_, sub_token_onshot_, local_expert_masking_, false) \
+    }
+
+#define MOE_SORTING_DISPATCH_SUBTO_(row_, local_expert_masking_)                \
+    if(is_sub_token_onshot)                                                     \
+    {                                                                           \
+        MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, true, local_expert_masking_)  \
+    }                                                                           \
+    else                                                                        \
+    {                                                                           \
+        MOE_SORTING_DISPATCH_DYNAMIC_TOKEN_(row_, false, local_expert_masking_) \
    }

 #define MOE_SORTING_DISPATCH_EMASK_(row_)        \
@@ -107,6 +129,10 @@
    }
 #endif

+float fused_moesorting_mp(fused_moesorting_trait t,
+                          fused_moesorting_args a,
+                          ck_tile::stream_config s);
+
 float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_tile::stream_config s)
 {
    if(t.weight_type == "fp32" && t.index_type == "int32")
@@ -153,18 +179,240 @@ float fused_moesorting(fused_moesorting_trait t, fused_moesorting_args a, ck_til
        }
        }
 #else
-        using index_t            = ck_tile::index_t;
-        using ms_weight_type     = float;
-        auto [r_, c_]            = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts);
-        auto sub_token_          = r_ - 2;
-        r_                       = (r_ - 2) / 8;
-        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        if(fused_moe_get_workspace_size(a.tokens, a.num_experts, a.topk) != 0)
+        {
+            return fused_moesorting_mp(t, a, s);
+        }
+        using index_t                = ck_tile::index_t;
+        using ms_weight_type         = float;
+        auto sub_token_              = ck_tile::moe_sorting_get_sub_token(a.tokens, a.num_experts);
+        auto row_                    = sub_token_ / 8;
+        bool is_sub_token_onshot     = a.tokens <= sub_token_;
        bool is_local_expert_masking = t.local_expert_masking;
-        (void)c_;
+        bool is_local_token          = a.p_local_tokens != nullptr;

-        MOE_SORTING_DISPATCH_EMASK_(r_);
+        MOE_SORTING_DISPATCH_EMASK_(row_);
        // MOE_SORTING_DISPATCH_ETILE(0, 0);
 #endif
    }
    return -1;
 }
+
+#define MOE_SORTING_MP_0(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+    }()
+
+#define MOE_SORTING_MP_1(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, 0, kargs);         \
+    }()
+#if MOE_SORTING_SUPPORT_LARGE_EXPERT
+#define MOE_SORTING_MP_2(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
+    }()
+
+#define MOE_SORTING_MP_3(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                         \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                        \
+        constexpr bool expert_masking         = expert_masking_;                                    \
+        constexpr bool local_token            = local_token_;                                       \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,            \
+                                                                             ms_weight_type,        \
+                                                                             mesh_type_,            \
+                                                                             unroll_num,            \
+                                                                             expert_masking,        \
+                                                                             local_token>;          \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                               \
+        const dim3 grids                      = kernel::GridSize(a);                                \
+        const dim3 blocks                     = kernel::BlockSize(a);                               \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                             \
+    }()
+#endif
+
+#define MOE_SORTING_MP_23(mesh_type_, unroll_num_, expert_masking_, local_token_)                    \
+    [&]() {                                                                                          \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                         \
+        constexpr bool expert_masking         = expert_masking_;                                     \
+        constexpr bool local_token            = local_token_;                                        \
+        using ms_problem                      = ck_tile::MoeSortingProblemMp<ms_index_t,             \
+                                                                             ms_weight_type,         \
+                                                                             mesh_type_,             \
+                                                                             unroll_num,             \
+                                                                             expert_masking,         \
+                                                                             local_token>;           \
+        using kernel                          = ck_tile::MoeSortingMultiPhaseKernel_P23<ms_problem>; \
+        auto kargs                            = kernel::MakeKargs(a);                                \
+        const dim3 grids                      = kernel::GridSize(a);                                 \
+        const dim3 blocks                     = kernel::BlockSize(a);                                \
+        const auto lds_size                   = kernel::GetSmemSize(a);                              \
+        return ck_tile::make_kernel<kernel::BLOCK_SIZE>(kernel{}, grids, blocks, lds_size, kargs);   \
+    }()
+
+#define MOR_SORTING_MP_DISPATCH_(mesh_type_, token_vec_0_, token_vec_1_, token_vec_23_)            \
+    if(t.local_expert_masking)                                                                     \
+    {                                                                                              \
+        if(is_local_token)                                                                         \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true, true),     \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, true, true),     \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true, true));  \
+            return ave_time;                                                                       \
+        }                                                                                          \
+        else                                                                                       \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, true, false),    \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, true, false),    \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, true, false)); \
+            return ave_time;                                                                       \
+        }                                                                                          \
+    }                                                                                              \
+    else                                                                                           \
+    {                                                                                              \
+        if(is_local_token)                                                                         \
+        {                                                                                          \
+            float ave_time =                                                                       \
+                ck_tile::launch_kernel(s,                                                          \
+                                       MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false, true),    \
+                                       MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false, true),    \
+                                       MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false, true)); \
+            return ave_time;                                                                       \
+        }                                                                                          \
+        else                                                                                       \
+        {                                                                                          \
+            float ave_time = ck_tile::launch_kernel(                                               \
+                s,                                                                                 \
+                MOE_SORTING_MP_0(mesh_type_, token_vec_0_, false, false),                          \
+                MOE_SORTING_MP_1(mesh_type_, token_vec_1_, false, false),                          \
+                MOE_SORTING_MP_23(mesh_type_, token_vec_23_, false, false));                       \
+            return ave_time;                                                                       \
+        }                                                                                          \
+    }
+
+float fused_moesorting_mp(fused_moesorting_trait t,
+                          fused_moesorting_args a,
+                          ck_tile::stream_config s)
+{
+    bool is_local_token = a.p_local_tokens != nullptr;
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        using ms_index_t     = ck_tile::index_t;
+        using ms_weight_type = float;
+
+        if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) >
+           ck_tile::get_smem_capacity())
+        {
+#if MOE_SORTING_SUPPORT_LARGE_EXPERT
+            if(t.local_expert_masking)
+            {
+                float ave_time = ck_tile::launch_kernel(s,
+                                                        MOE_SORTING_MP_0(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_1(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_2(ms_index_t, 1, true),
+                                                        MOE_SORTING_MP_3(ms_index_t, 1, true));
+                return ave_time;
+            }
+            else
+            {
+                float ave_time = ck_tile::launch_kernel(s,
+                                                        MOE_SORTING_MP_0(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_1(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_2(ms_index_t, 1, false),
+                                                        MOE_SORTING_MP_3(ms_index_t, 1, false));
+                return ave_time;
+            }
+#else
+            printf("do not support large expert %d\n", a.num_experts);
+            return -1;
+#endif
+        }
+        else
+        {
+            ck_tile::index_t mesh_byte_size =
+                ck_tile::impl::moe_sorting_mesh_byte_size(a.tokens, a.num_experts, a.topk);
+            if(mesh_byte_size == 1)
+            {
+                if(a.tokens * a.topk % 4 == 0)
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint8_t, 4, 16, 16)
+                }
+                else
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint8_t, 1, 16, 16)
+                }
+            }
+            else if(mesh_byte_size == 2)
+            {
+#if MOE_SORTING_SUPPORT_LARGE_TOPK
+                if(a.tokens * a.topk % 4 == 0)
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint16_t, 4, 8, 8)
+                }
+                else
+                {
+                    MOR_SORTING_MP_DISPATCH_(uint16_t, 1, 8, 8)
+                }
+#else
+                printf("do not support large topk %d\n", a.topk);
+                return -1;
+#endif
+            }
+            else
+            {
+                MOR_SORTING_MP_DISPATCH_(ck_tile::index_t, 1, 1, 1)
+            }
+        }
+    }
+    return -1;
+}
+
+int fused_moesorting_get_workspace_size(int tokens, int num_experts, int topk)
+{
+    return ck_tile::moe_sorting_get_workspace_size(
+        tokens, num_experts, topk, 0 /*dispatch policy*/);
+}
--- a/example/ck_tile/15_fused_moe/main.cpp
+++ b/example/ck_tile/15_fused_moe/main.cpp
@@ -87,7 +87,18 @@ void topid_unique_gen(
 auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("t", "128", "num input tokens")
+    arg_parser
+        .insert("t",
+                "128",
+                "number of input tokens.\n"
+                "If \"local_t\" presents, this value indicates global concurrency of all ranks.")
+        .insert(
+            "local_t",
+            "-1",
+            "Number of local input tokens for curent rank.\n"
+            "This value must be within range \"[0, t)\", or \"-1\"(no such feature)\n"
+            "This feature is to simulate EP case where where each rank has different tokens.\n"
+            "Besides, this value will be stored in a GPU buffer, which is friendly for CUDA graph.")
        .insert("e", "32", "num of experts")
        .insert("k", "5", "topk")
        .insert("h", "8192", "hidden_size of this model")
@@ -131,6 +142,7 @@ template <typename I, typename W, typename O, typename ST, typename SW, typename
 bool run(const ck_tile::ArgParser& arg_parser)
 {
    ck_tile::index_t tokens            = arg_parser.get_int("t");
+    ck_tile::index_t local_tokens      = arg_parser.get_int("local_t");
    ck_tile::index_t experts           = arg_parser.get_int("e");
    ck_tile::index_t topk              = arg_parser.get_int("k");
    ck_tile::index_t hidden_size       = arg_parser.get_int("h");
@@ -169,6 +181,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
    // w1 (Down, N size)
    ck_tile::index_t shared_intermediate_size_1 = intermediate_size / tp;

+    bool is_local_token = local_tokens >= 0 && local_tokens < tokens;
+
+    if(local_tokens > tokens)
+    {
+        printf("local_tokens:%d larger than tokens:%d, invalid\n", local_tokens, tokens);
+        return false;
+    }
+
    auto prec_str = [&]() {
        auto base_str = prec_i;
        if(prec_i != prec_w)
@@ -198,11 +218,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
            return std::string(", st:") + std::to_string(stride);
    }();

+    std::cout << "[" << api_str << "|" << prec_str << "]" << " t:" << tokens;
+
+    if(is_local_token)
+    {
+        std::cout << "(" << local_tokens << ")";
+    }
+
    std::cout
-        << "[" << api_str << "|" << prec_str << "]"
-        << " t:" << tokens << ", e:" << experts << ", k:" << topk << stride_str
-        << ", hidden:" << hidden_size << ", interm:" << intermediate_size << ", tp:" << tp
-        << ", act:"
+        << ", e:" << experts << ", k:" << topk << stride_str << ", hidden:" << hidden_size
+        << ", interm:" << intermediate_size << ", tp:" << tp << ", act:"
        << activation
        // << ", shrd_interm:" << shared_intermediate_size_0 << "|" << shared_intermediate_size_1
        << (gate_only ? ", g1u0" : ", g1u1") << ", q:" << fused_quant << std::flush;
@@ -372,10 +397,16 @@ bool run(const ck_tile::ArgParser& arg_parser)
            num_sorted_tiles_host.get_element_space_size_in_bytes());

        // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
-        ck_tile::index_t workspace_size = ck_tile::moe_sorting_get_workspace_size(tokens, experts);
+        ck_tile::index_t workspace_size =
+            ck_tile::moe_sorting_get_workspace_size(tokens, experts, topk, 0 /*dispatch_policy*/);
        ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
        if(workspace_size != 0)
            moe_sorting_ws.SetZero(); // note, clear here!!!!
+        ck_tile::DeviceMem local_tokens_dev(sizeof(ck_tile::index_t));
+        if(is_local_token)
+        {
+            local_tokens_dev.ToDevice(&local_tokens);
+        }

        fused_moe_traits traits{prec_i,
                                prec_w,
@@ -399,6 +430,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                            fused_quant == 1 ? sy_buf.GetDeviceBuffer() : nullptr,
                            local_expert_masking ? local_expert_mask_buf.GetDeviceBuffer()
                                                 : nullptr,
+                            is_local_token ? local_tokens_dev.GetDeviceBuffer() : nullptr,
                            o_buf.GetDeviceBuffer(),
                            workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
                            topk_ids_buf.GetDeviceBuffer(),
@@ -462,6 +494,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                num_sorted_tiles_host.mData[0],
                experts,
                block_m,
+                is_local_token ? local_tokens : tokens,
                local_expert_masking);
            if(activation == 0)
            {
@@ -494,6 +527,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
            num_sorted_tiles_host.mData[0],
            experts,
            block_m,
+            is_local_token ? local_tokens : tokens,
            local_expert_masking);

        // done, preparing GPU buffer
@@ -505,6 +539,11 @@ bool run(const ck_tile::ArgParser& arg_parser)
        ck_tile::DeviceMem sd_buf(sd_host);
        ck_tile::DeviceMem sy_buf(sy_host);
        ck_tile::DeviceMem o_buf(o_host);
+        ck_tile::DeviceMem local_tokens_dev(sizeof(ck_tile::index_t));
+        if(is_local_token)
+        {
+            local_tokens_dev.ToDevice(&local_tokens);
+        }

        // manually clear output buffer for atomic
        o_buf.SetZero();
@@ -541,7 +580,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                num_sorted_tiles_buf.GetDeviceBuffer(),
                                hidden_size,
                                intermediate_size / tp,
-                                tokens,
+                                is_local_token ? local_tokens : tokens,
                                experts,
                                topk,
                                stride};
--- a/example/ck_tile/16_batched_gemm/batched_gemm.cpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.cpp
@@ -15,7 +15,16 @@
 #include "ck_tile/host.hpp"
 #include "batched_gemm.hpp"

-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stream_config& s)
 {
 #if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
@@ -106,191 +115,104 @@ float batched_gemm(const ck_tile::BatchedGemmHostArgs& args, const ck_tile::stre

    float ave_time{0};

-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-        constexpr auto scheduler      = GEMM_PIPELINE_SCHEDULER;
+    const auto Run =
+        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+            constexpr auto memory_operation = memory_operation_.value;

-        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
-                                                                           BDataType,
-                                                                           AccDataType,
-                                                                           GemmShape,
-                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;

-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
-        using GemmEpilogue = ck_tile::CShuffleEpilogue<
-            ck_tile::CShuffleEpilogueProblem<ADataType,
-                                             BDataType,
-                                             AccDataType,
-                                             CDataType,
-                                             CLayout,
-                                             GemmPipelineProblem::kBlockSize,
-                                             TilePartitioner::MPerBlock,
-                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC>>;
-        using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKernelArgs(args);
+            using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;

-        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
-        constexpr dim3 blocks = Kernel::BlockSize();
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 DsLayout,
+                                                 CLayout,
+                                                 CDEElementWise,
+                                                 GemmPipelineProblem::kBlockSize,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;

-        if(!Kernel::IsSupportedArgument(kargs))
+            using Kernel = ck_tile::BatchedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids = Kernel::GridSize(args.M, args.N, args.k_batch, args.batch_count);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << GemmPipelineProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            return ave_time;
+        };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
        {
-            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
        }
-
-        if(s.log_level_ > 0)
+        else
        {
-            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                      << "shape: " << GemmShape::GetName() << '\n'
-                      << "problem: " << GemmPipelineProblem::GetName() << '\n'
-                      << "pipeline: " << GemmPipeline::GetName() << '\n'
-                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
        }
-
-        ave_time = ck_tile::launch_kernel(
-            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-        return ave_time;
    };

-    if(has_hot_loop)
-    {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-        }
-        else
-        {
-            std::ostringstream err;
-            err << "Incorrect tail_num for compv3 pipeline! Expected Full, Odd or Even, but got "
-                << tail_num << "\nPrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-        // Tail pipeline One to Seven
-        if(tail_num == ck_tile::TailNumber::One)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-
-        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
-        {
-            if(tail_num == ck_tile::TailNumber::Two)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
-        {
-            if(tail_num == ck_tile::TailNumber::Three)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
-        {
-            if(tail_num == ck_tile::TailNumber::Four)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
-        {
-            if(tail_num == ck_tile::TailNumber::Five)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
-        {
-            if(tail_num == ck_tile::TailNumber::Six)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
-        {
-            if(tail_num == ck_tile::TailNumber::Seven)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
-            }
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-        if(tail_num == ck_tile::TailNumber::Three)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-        }
-        else
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-        }
-#endif
-    }
-    else
-    {
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            Run(ck_tile::bool_constant<false>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        std::ostringstream err;
-        err << "Incorrect tail_num for pipeline without hotloop, expected Full, Odd or Even, but "
-               "got "
-            << tail_num << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-            << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-        throw std::runtime_error(err.str());
-    }
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);

    return ave_time;
 }

 #include "run_batched_gemm_example.inc"

-int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); }
+int main(int argc, char* argv[])
+{
+    try
+    {
+        return !run_batched_gemm_example(argc, argv);
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/16_batched_gemm/batched_gemm.hpp
+++ b/example/ck_tile/16_batched_gemm/batched_gemm.hpp
@@ -8,6 +8,7 @@
 #include "ck_tile/core.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
 #include "ck_tile/ops/gemm/kernel/batched_gemm_kernel.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"

 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
--- a/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
+++ b/example/ck_tile/16_batched_gemm/run_batched_gemm_example.inc
@@ -23,7 +23,16 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }

-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                          ck_tile::DeviceMem& b_k_n_dev_buf,
                          ck_tile::DeviceMem& c_m_n_dev_buf,
@@ -41,23 +50,31 @@ float invoke_batched_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
                          int n_warmup,
                          int n_repeat)
 {
-    ck_tile::BatchedGemmHostArgs args;
-    args.a_ptr          = a_m_k_dev_buf.GetDeviceBuffer();
-    args.b_ptr          = b_k_n_dev_buf.GetDeviceBuffer();
-    args.c_ptr          = c_m_n_dev_buf.GetDeviceBuffer();
-    args.k_batch        = kbatch;
-    args.M              = M;
-    args.N              = N;
-    args.K              = K;
-    args.stride_A       = stride_A;
-    args.stride_B       = stride_B;
-    args.stride_C       = stride_C;
-    args.batch_stride_A = batch_stride_A;
-    args.batch_stride_B = batch_stride_B;
-    args.batch_stride_C = batch_stride_C;
-    args.batch_count    = batch_count;
+    ck_tile::BatchedGemmHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
+                                      b_k_n_dev_buf.GetDeviceBuffer(),
+                                      c_m_n_dev_buf.GetDeviceBuffer(),
+                                      kbatch,
+                                      M,
+                                      N,
+                                      K,
+                                      stride_A,
+                                      stride_B,
+                                      stride_C,
+                                      batch_stride_A,
+                                      batch_stride_B,
+                                      batch_stride_C,
+                                      batch_count};

-    float ave_time = batched_gemm<ALayout, BLayout, CLayout>(
+    float ave_time = batched_gemm<ADataType,
+                                  BDataType,
+                                  DsDataType,
+                                  AccDataType,
+                                  CDataType,
+                                  ALayout,
+                                  BLayout,
+                                  DsLayout,
+                                  CLayout,
+                                  CDEElementWise>(
        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});

    std::string op_name{"Batched Gemm"};
@@ -169,22 +186,30 @@ int run_batched_gemm_example_with_layouts(int argc,
    c_m_n_dev_buf.SetZero();
    c_m_n_dev_result.SetZero();

-    invoke_batched_gemm<ALayout, BLayout, CLayout>(a_m_k_dev_buf,
-                                                   b_k_n_dev_buf,
-                                                   c_m_n_dev_buf,
-                                                   M,
-                                                   N,
-                                                   K,
-                                                   stride_A,
-                                                   stride_B,
-                                                   stride_C,
-                                                   batch_stride_A,
-                                                   batch_stride_B,
-                                                   batch_stride_C,
-                                                   batch_count,
-                                                   kbatch,
-                                                   n_warmup,
-                                                   n_repeat);
+    invoke_batched_gemm<ADataType,
+                        BDataType,
+                        ck_tile::tuple<>,
+                        AccDataType,
+                        CDataType,
+                        ALayout,
+                        BLayout,
+                        ck_tile::tuple<>,
+                        CLayout>(a_m_k_dev_buf,
+                                 b_k_n_dev_buf,
+                                 c_m_n_dev_buf,
+                                 M,
+                                 N,
+                                 K,
+                                 stride_A,
+                                 stride_B,
+                                 stride_C,
+                                 batch_stride_A,
+                                 batch_stride_B,
+                                 batch_stride_C,
+                                 batch_count,
+                                 kbatch,
+                                 n_warmup,
+                                 n_repeat);
    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
    bool pass = true;

--- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -1,2 +1 @@
 add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
-
--- a/example/ck_tile/17_grouped_gemm/README.md
+++ b/example/ck_tile/17_grouped_gemm/README.md
@@ -1,8 +1,149 @@
-# Grouped CShuffle GEMM
+# Grouped Gemm

-This folder contains example for Grouped GEMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile GEMM, but creates the placeholders for the future support on different GEMM pipeline and different GEMM modules. In the near future, we will gradually migrate all the GEMM features from old CK to CK Tile.
+Grouped General Matrix Multiplication (Grouped GEMM) is a technique used in GPU computing and high-performance computing to batch together multiple independent GEMM operations (matrix multiplications) into a single kernel launch in order to improve performance and efficiency. This folder contains Grouped GEMM examples that use the ck_tile tile-programming implementation.  

-## build
+## Quick Tour for New Users
+
+The `Grouped GEMM` operators are versions of GEMM that run multiple GEMM operations within a single kernel call. Each GEMM operation performs a matrix multiplication. Unlike regular batched GEMM operations where both matrices must be of the same size and have the same configuration, Grouped GEMM operations can take matrices with different sizes and configurations, making them more flexible for diverse workloads.
+
+Let's now break the example into the following parts: parsing arguments, preparing host and device buffers, preparing data, invoking GEMM, and building the example, while explaining each function.
+
+### Parsing Arguments
+The example takes three arguments: `group_count`, `repeat`, and `warmup`:
+- `group_count`: the number of GEMM operations in the group, 
+- `repeat`: the number of times to repeat the kernel for benchmarking
+- `warmup`: the number of iterations before the actual kernel run time measure.
+
+```cpp
+// Example
+const int group_count = arg_parser.get_int("group_count");
+const int repeat      = arg_parser.get_int("repeat");
+const int warmup      = arg_parser.get_int("warmup");
+```
+In the next step, the input parameters `Ms`, `Ns`, `Ks`, as well as the corresponding `stride_As`, `stride_Bs`, and `stride_Cs` are either provided from the comand line or generated by default. Since one or more input data sets are expected for `A` and `B`, each parameter is stored in a `std::vector`. The size of the `vector` is defined by `group_count`.
+
+```cpp
+// Example
+std::vector<ck_tile::index_t> Ms        = arg_parser.get_int_vec("Ms");
+std::vector<ck_tile::index_t> Ns        = arg_parser.get_int_vec("Ns");
+std::vector<ck_tile::index_t> Ks        = arg_parser.get_int_vec("Ks");
+std::vector<ck_tile::index_t> stride_As = arg_parser.get_int_vec("stride_As");
+std::vector<ck_tile::index_t> stride_Bs = arg_parser.get_int_vec("stride_Bs");
+std::vector<ck_tile::index_t> stride_Cs = arg_parser.get_int_vec("stride_Cs");
+```
+Where:
+- `Ms` is the M dimension of each GEMM.
+- `Ns` is the N dimension of each GEMM.
+- `Ks` is the K dimension of each GEMM.
+- `stride_As` is the stride values for matrix A.
+- `stride_Bs` is the stride  values for matrix B.
+- `stride_Cs` is the stride  values for matrix C.
+
+### HostTensor and Device Memory Buffers (for CPU and GPU) 
+Each parameter `Ms`, `Ns`, `Ks`, `stride_As`, `stride_Bs` and `stride_Cs` contains values for more than one matrix, meaning different matrix sizes and strides can be used for different grouped GEMM computations.
+The next step is to properly load the input values. For each input matrix, `A` and `B`, and for each output matrix, `C`, you need to create both `HostTensor` and `DeviceMemory`, where: 
+- `HostTensor` represents the matrix data on the host (CPU). It stores the data before they are transferred to the device for computation.
+- `DeviceMemory` represents the matrix data on the device (GPU). This will store the data on the GPU for computation during the Grouped GEMM operation.
+
+#### HostTensor Buffers (for CPU)
+In the first step, create `HostTensor` for `A`, `B`, `C`. `HostTensor` allocates memory on the host (CPU) to store the matrices, initializing the memory with the appropriate dimensions and values to store the data. Below is an example code showing how to create HostTensors for those tensors:
+```cpp
+// Example
+std::vector<ck_tile::HostTensor<ADataType>> a_m_k_tensors;
+std::vector<ck_tile::HostTensor<BDataType>> b_k_n_tensors;
+std::vector<ck_tile::HostTensor<CDataType>> c_m_n_tensors;
+```
+Where:
+- `a_m_k_tensors` is the vector of `HostTensor` objects for matrix `A` (with dimensions `M × K`). Each tensor stores the data for single GEMM operation.
+- `b_k_n_tensors` is the vector of `HostTensor` objects for matrix `B` (with dimensions `K × N`).
+- `c_m_n_tensors` is the vector of `HostTensor` objects for matrix `C` (the output matrix with dimensions `M × N`).
+
+The `std::vector` container is used for this purpose throughout. As mentioned above, the number of HostTensors is equal to `group_count`.
+
+#### Device Memory Buffers (for GPU)
+Now it's time to allocate memory on the device (GPU) and transfer the data from `HostTensor` to `DeviceMemory` for actual computation..
+```cpp
+// Example
+std::vector<std::unique_ptr<ck_tile::DeviceMem>> a_m_k_dev_buf;
+std::vector<std::unique_ptr<ck_tile::DeviceMem>> b_k_n_dev_buf;
+std::vector<std::unique_ptr<ck_tile::DeviceMem>> c_m_n_dev_buf;
+``` 
+Where:
+- `a_m_k_dev_buf` is the buffer used for storing matrix A on the GPU.
+- `b_k_n_dev_buf` is the buffer used for storing matrix B on the GPU.
+- `c_m_n_dev_buf` is the buffer used for storing the result matrix C on the GPU.
+
+## Prepare data
+In the next step, the input tensors are populated. A pseudorandom number generator, an existing distribution (e.g., `FillUniformDistribution`), or user data can be used to populate the tensors. Descriptors also need to be create for each input tensor.
+
+Use `get_default_stride` to get the strides for A, B, and C. `get_default_stride` is a template function that calculates the default stride for a 2D array based on whether it is row-major or column-major. Template parameter determines whether the storage order is row-major (true) or column-major (false). The function takes four params `row`, `col`, `stride` and `bool_constant<is_row_major>`. If the stride is explicitly provided (`stride != 0`), the stride is returned as-is. If the stride is not provided (`stride == 0`), the function computes the default stride. For the Row-major order (`is_row_major == true`), the stride is set to the number of columns (col). For the column-major order (`is_row_major == false`), the stride is set to the number of rows (row). This function is useful when working with dynamically allocated 2D arrays, where the user may not specify the stride explicitly. It ensures a natural default stride based on the chosen storage order.
+
+```cpp
+// Example, API
+template <bool is_row_major>
+auto get_default_stride(std::size_t row, std::size_t col, std::size_t stride, bool_constant<is_row_major>) {
+  // code
+}
+```
+
+Where: 
+- `is_row_major` is a bool template parameter that determines whether the storage order is row-major (true) or column-major (false).
+- `row` is the number of rows in the matrix.
+- `col` is the number of columns in the matrix.
+- `stride` is the current stride (the distance between consecutive elements in memory).
+- `bool_constant<is_row_major>` is a tag type that helps in differentiating behavior at compile-time.
+
+Next host descriptors for each of the input tensors, A, B, and C are created. Use the `f_host_tensor_descriptor` function defined below. This function takes four parameters, row, col, stride, and layout, and returns a HostTensorDescriptor based on the specified layout.
+
+```cpp
+// Example for tensor A
+ck_tile::HostTensor<ADataType>(f_host_tensor_descriptor(M, K, stride_As[i], a_layout)))
+```
+
+After creating the host_tensors, create `deviceMem` for each tensor `A`, `B`, and `C`, and then transfer the data to the device. The `get_element_space_size_in_bytes()` function is used to get the buffer size in bytes. Use `ToDevice()` to transfer data from the host to the device. The data that was previously generated (`a_m_k_tensors[i].data()`) is passed as a parameter to `ToDevice()`.
+
+The final step before running the GEMM operation is to retrieve the pointers to the buffers of `A`, `B`, and `C` stored on the device using `->GetDeviceBuffer()` and pack them into a shared container. For example: `gemm_descs.push_back({p_a, p_b, p_c, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]})`, where `gemm_descs` is `std::vector<grouped_gemm_kargs> gemm_descs` ([Code](https://github.com/ROCm/composable_kernel/blob/develop/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc#L221)). The container should include values such as:
+```cpp
+struct GroupedGemmHostArgs
+{
+    const void* a_ptr;
+    const void* b_ptr;
+    void* c_ptr;
+    index_t M;
+    index_t N;
+    index_t K;
+    index_t stride_A;
+    index_t stride_B;
+    index_t stride_C;
+};
+```
+The data prepared in this way can be passed to the `invoke_gemm` function. This is a templated function that also takes three template parameters: `ALayout`, `BLayout`, and `CLayout`:
+```cpp
+// Example, API
+template <typename ALayout, typename BLayout, typename CLayout, bool Persistent>
+float invoke_gemm(int n_warmup,
+                  int n_repeat,
+                  int group_count,
+                  const std::vector<grouped_gemm_kargs>& args)
+```
+`invoke_gemm` returns the run time in milliseconds. The workspace memory required for computation is allocated. Workspace memory on the GPU refers to temporary memory buffers allocated when some operations are run. This extra space is needed to hold GEMM descriptions. The following structure can be used to allocate workspace:
+
+```cpp
+// Example
+ck_tile::DeviceMem gemm_workspace;
+gemm_workspace.Realloc(GetWorkspaceSize(args));
+```
+Finally the arguments are passed to group_gemm and the kernel is launched.
+```cpp
+// API
+template <typename ALayout, typename BLayout, typename CLayout>
+float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
+                   const ck_tile::stream_config& s,
+                   void* kargs_ptr)
+```
+All the necessary parameters are set, the tiling is computed, the GEMM pipeline and epilogue are prepared, and the GroupedGemmKernel is launched.
+
+## Build
 ```
 # in the root of ck_tile
 mkdir build && cd build
@@ -16,10 +157,17 @@ This will result in an executable `build/bin/tile_example_grouped_gemm`
 ## example
 ```
 args:
-   -a_layout    Tensor A layout (default:R)
-   -b_layout    Tensor B layout (default:R)
-   -c_layout    Tensor C layout (default:R)
-          -v    0. No validation, 1. Validation on CPU
-     -warmup    number of iterations before benchmark the kernel (default:10)
-     -repeat    number of iterations to benchmark the kernel (default:100)
+ -Ms          M dimensions - (Default: empty).
+ -Ns          N dimensions - (Default: empty).
+ -Ks          K dimensions - (Default: empty).
+ -stride_As   Tensor A strides - (Default: empty).
+ -stride_Bs   Tensor B strides - (Default: empty).
+ -stride_Cs   Tensor C strides - (Default: empty).
+ -a_layout    A tensor data layout - (Default: Row).
+ -b_layout    B tensor data layout - (Default: Col).
+ -c_layout    C tensor data layout - (Default: Row).
+ -validate    0. No validation, 1. Validation on CPU. (Default: 1).
+ -warmup      Number of iterations before benchmark the kernel. (Default: 10).
+ -repeat      Number of iterations to benchmark the kernel. (Default: 100).
+ -group_count Group count. (Default: 16).
 ```
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #include <hip/hip_runtime.h>

@@ -16,273 +16,115 @@
 #include "ck_tile/host.hpp"
 #include "grouped_gemm.hpp"

-std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr,
+                            bool splitk)
 {
-    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
-}
-
-template <typename ALayout, typename BLayout, typename CLayout>
-float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
-                   const ck_tile::stream_config& s,
-                   void* p_workspace_)
-{
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-    // Memory friendly for Interwave scheduler
-    constexpr ck_tile::index_t M_Tile = 128;
-    constexpr ck_tile::index_t N_Tile = 32;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 4;
-    constexpr ck_tile::index_t N_Warp = 1;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 8;
-
-    constexpr bool DoubleSmemBuffer = false;
-#endif
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-    // Compute friendly for Intrawave scheduler
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 64;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = false;
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-    // Compute friendly for Intrawave scheduler
-    // Using the ping pong reader in the lds level
-    constexpr ck_tile::index_t M_Tile = 256;
-    constexpr ck_tile::index_t N_Tile = 256;
-    constexpr ck_tile::index_t K_Tile = 32;
-
-    constexpr ck_tile::index_t M_Warp = 2;
-    constexpr ck_tile::index_t N_Warp = 2;
-    constexpr ck_tile::index_t K_Warp = 1;
-
-    constexpr ck_tile::index_t M_Warp_Tile = 32;
-    constexpr ck_tile::index_t N_Warp_Tile = 32;
-    constexpr ck_tile::index_t K_Warp_Tile = 16;
-
-    constexpr bool DoubleSmemBuffer = true;
-#endif
-
    constexpr bool kPadM = false;
    constexpr bool kPadN = false;
    constexpr bool kPadK = false;

-    constexpr bool TransposeC = false;
-
-    constexpr int kBlockPerCu                         = 1;
    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
    constexpr ck_tile::index_t TileParitionerM01      = 4;

-    using GemmShape =
-        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
-                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
-                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
    using TilePartitioner = ck_tile::
        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;

    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
-    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
-                                                                 kPadN,
-                                                                 kPadK,
-                                                                 DoubleSmemBuffer,
-                                                                 ALayout,
-                                                                 BLayout,
-                                                                 CLayout,
-                                                                 TransposeC>;
+    using GemmUniversalTraits =
+        ck_tile::PersistentTileGemmUniversalTraits<GemmConfig::kPadM,
+                                                   GemmConfig::kPadN,
+                                                   GemmConfig::kPadK,
+                                                   GemmConfig::DoubleSmemBuffer,
+                                                   ALayout,
+                                                   BLayout,
+                                                   CLayout>;
    using GemmPipelineProblem =
        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;

-    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
-
-    const ck_tile::index_t k_grain     = gemm_descs[0].k_batch * K_Tile;
-    const ck_tile::index_t K_split     = (gemm_descs[0].K + k_grain - 1) / k_grain * K_Tile;
-    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
-    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
-    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-
    float ave_time{0};

-    const auto Run = [&](const auto has_hot_loop_, const auto tail_number_) {
-        constexpr bool has_hot_loop_v = has_hot_loop_.value;
-        constexpr auto tail_number_v  = tail_number_.value;
-        constexpr auto scheduler      = GEMM_PIPELINE_SCHEDULER;
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto scheduler        = GemmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;

+        // We create the GEMM pipeline without specifying hotloop or tailnumber.
+        // These are automatically run inside the kernel based on the given input data.
        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
                                                                           BDataType,
                                                                           AccDataType,
                                                                           GemmShape,
                                                                           GemmUniversalTraits,
-                                                                           scheduler,
-                                                                           has_hot_loop_v,
-                                                                           tail_number_v>;
+                                                                           scheduler>;

-        using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
        using GemmEpilogue = ck_tile::CShuffleEpilogue<
            ck_tile::CShuffleEpilogueProblem<ADataType,
                                             BDataType,
+                                             ck_tile::tuple<>,
                                             AccDataType,
                                             CDataType,
+                                             ck_tile::tuple<>,
                                             CLayout,
+                                             ck_tile::element_wise::PassThrough,
                                             GemmPipelineProblem::kBlockSize,
                                             TilePartitioner::MPerBlock,
                                             TilePartitioner::NPerBlock,
-                                             M_Warp,
-                                             N_Warp,
-                                             M_Warp_Tile,
-                                             N_Warp_Tile,
-                                             K_Warp_Tile,
-                                             UniversalGemmProblem::TransposeC>>;
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             memory_operation>>;
        using Kernel = ck_tile::GroupedGemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
-        auto kargs   = Kernel::MakeKargs(gemm_descs);
-
-        const dim3 grids      = Kernel::GridSize(gemm_descs);
        constexpr dim3 blocks = Kernel::BlockSize();
-
-        ck_tile::hip_check_error(hipMemcpyWithStream(p_workspace_,
-                                                     kargs.data(),
-                                                     get_workspace_size(gemm_descs),
-                                                     hipMemcpyHostToDevice,
-                                                     s.stream_id_));
+        const dim3 grids      = Kernel::MaxOccupancyGridSize(s);

        if(s.log_level_ > 0)
        {
-            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:"
-                      << " grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
-                      << std::endl;
+            std::cout << "Launching kernel: " << Kernel::GetName() << " with args:" << " grid: {"
+                      << grids.x << ", " << grids.y << ", " << grids.z << "}" << ", blocks: {"
+                      << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" << std::endl;
        }

-        ave_time = ck_tile::launch_kernel(
-            s,
-            ck_tile::make_kernel<blocks.x, kBlockPerCu>(
-                Kernel{},
-                grids,
-                blocks,
-                0,
-                ck_tile::cast_pointer_to_constant_address_space(p_workspace_),
-                gemm_descs.size()));
+        ave_time =
+            ck_tile::launch_kernel(s,
+                                   ck_tile::make_kernel<blocks.x, GemmConfig::kBlockPerCu>(
+                                       Kernel{},
+                                       grids,
+                                       blocks,
+                                       0,
+                                       ck_tile::cast_pointer_to_constant_address_space(kargs_ptr),
+                                       num_groups));
+
        return ave_time;
    };

-    if(has_hot_loop)
+    if(!splitk)
    {
-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-        if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Odd)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Odd>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Even)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Even>{});
-        }
-        else
-        {
-            std::ostringstream err;
-            err << "Incorrect tail_num for compv3 pipeline! Expected Full, Odd or Even, but got "
-                << tail_num << "\nPrefetchStages: " << BaseGemmPipeline::PrefetchStages
-                << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-            throw std::runtime_error(err.str());
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-        // Tail pipeline One to Seven
-        if(tail_num == ck_tile::TailNumber::One)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::One>{});
-        }
-        else if(tail_num == ck_tile::TailNumber::Full)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Full>{});
-        }
-
-        if constexpr(BaseGemmPipeline::PrefetchStages > 2)
-        {
-            if(tail_num == ck_tile::TailNumber::Two)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 3)
-        {
-            if(tail_num == ck_tile::TailNumber::Three)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 4)
-        {
-            if(tail_num == ck_tile::TailNumber::Four)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Four>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 5)
-        {
-            if(tail_num == ck_tile::TailNumber::Five)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Five>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 6)
-        {
-            if(tail_num == ck_tile::TailNumber::Six)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Six>{});
-            }
-        }
-        if constexpr(BaseGemmPipeline::PrefetchStages > 7)
-        {
-            if(tail_num == ck_tile::TailNumber::Seven)
-            {
-                Run(ck_tile::bool_constant<true>{},
-                    ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Seven>{});
-            }
-        }
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-        if(tail_num == ck_tile::TailNumber::Three)
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Three>{});
-        }
-        else
-        {
-            Run(ck_tile::bool_constant<true>{},
-                ck_tile::integral_constant<ck_tile::TailNumber, ck_tile::TailNumber::Two>{});
-        }
-#endif
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::set>{});
    }
    else
    {
-        std::ostringstream err;
-        err << "Incorrect tail_num for pipeline without hotloop, expected Full, Odd or Even, but "
-            << "got " << tail_num << "\n PrefetchStages: " << BaseGemmPipeline::PrefetchStages
-            << "\n File: " << __FILE__ << ":" << __LINE__ << ", in function: " << __func__;
-        throw std::runtime_error(err.str());
+        Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                       ck_tile::memory_operation_enum::atomic_add>{});
    }

    return ave_time;
@@ -290,4 +132,8 @@ float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,

 #include "run_grouped_gemm_example.inc"

-int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); }
+constexpr bool Persistent = true;
+int main(int argc, char* argv[])
+{
+    return !run_grouped_gemm_example<Persistent, GemmConfigComputeV4>(argc, argv);
+}
--- a/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm.hpp
@@ -7,7 +7,8 @@

 #include "ck_tile/core.hpp"
 #include "ck_tile/host/kernel_launch.hpp"
-#include "ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"

 #define CK_TILE_PIPELINE_COMPUTE_V3 1
 #define CK_TILE_PIPELINE_MEMORY 2
@@ -17,21 +18,23 @@
 #define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
 #endif

-#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
-#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
-#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
-#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
-#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if defined(CK_GFX950_SUPPORT)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
 #else
-#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
 #endif
+}

 template <typename DataType>
 struct GemmTypeConfig;
@@ -45,15 +48,111 @@ struct GemmTypeConfig<ck_tile::half_t>
    using AccDataType = float;
 };

-using Types = GemmTypeConfig<ck_tile::half_t>;
+template <>
+struct GemmTypeConfig<ck_tile::fp8_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::fp8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};

-// Specific type aliases for easy access
-using ADataType   = Types::ADataType;
-using BDataType   = Types::BDataType;
-using AccDataType = Types::AccDataType;
-using CDataType   = Types::CDataType;
+struct GemmConfigBase
+{
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;

-using grouped_gemm_kargs = ck_tile::GemmHostArgs;
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::index_t Pipeline      = CK_TILE_PIPELINE_COMPUTE_V3;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_2 : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = false;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 1;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4 : public GemmConfigBase
+{
+    // Compute V4 only support Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType);
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer     = true;
+    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
+
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <ck_tile::index_t PipelineId>
+struct PipelineTypeTraits;
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+using grouped_gemm_kargs = ck_tile::GroupedGemmHostArgs;

 auto create_args(int argc, char* argv[])
 {
@@ -68,16 +167,45 @@ auto create_args(int argc, char* argv[])
        .insert("b_layout", "C", "B tensor data layout - Row by default.")
        .insert("c_layout", "R", "C tensor data layout - Row by default.")
        .insert("validate", "1", "0. No validation, 1. Validation on CPU.")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
        .insert("warmup", "10", "number of iterations before benchmark the kernel.")
        .insert("repeat", "100", "number of iterations to benchmark the kernel.")
-        .insert("group_count", "8", "group count.");
+        .insert("group_count", "8", "group count.")
+        .insert("kbatch", "1", "kbatch for SplitK");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
 }

-std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs);
+inline std::size_t get_workspace_size(const std::vector<grouped_gemm_kargs>& gemm_descs)
+{
+    return gemm_descs.size() * sizeof(ck_tile::GemmTransKernelArg);
+}

+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent,
+          typename CDEElementWise>
 float grouped_gemm(const std::vector<grouped_gemm_kargs>& gemm_descs,
                   const ck_tile::stream_config& s,
-                   void* p_workspace_);
+                   void* kargs_ptr);
+
+template <typename GemmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType>
+float grouped_gemm_tileloop(const ck_tile::stream_config& s,
+                            const ck_tile::index_t num_groups,
+                            void* kargs_ptr,
+                            bool splitk = false);
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -10,6 +10,7 @@ static constexpr inline auto is_row_major(Layout layout_)
                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
 }

+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
 auto calculate_rtol_atol(const ck_tile::index_t K,
                         const ck_tile::index_t kbatch,
                         const float max_accumulated_value)
@@ -30,20 +31,88 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }

-template <typename ALayout, typename BLayout, typename CLayout>
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
 float invoke_gemm(int n_warmup,
                  int n_repeat,
                  int group_count,
                  const std::vector<grouped_gemm_kargs>& args)
 {
-
+    // Workspace memory allocated to hold the gemm descriptions.
    ck_tile::DeviceMem gemm_workspace;
    gemm_workspace.Realloc(get_workspace_size(args));

-    float ave_time = grouped_gemm<ALayout, BLayout, CLayout>(
-        args,
-        ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat},
-        gemm_workspace.GetDeviceBuffer());
+    float ave_time = 0;
+    if constexpr(!Persistent)
+    {
+        // Regular version of grouped gemm
+        ave_time = grouped_gemm<ADataType,
+                                BDataType,
+                                DsDataType,
+                                AccDataType,
+                                CDataType,
+                                ALayout,
+                                BLayout,
+                                DsLayout,
+                                CLayout,
+                                CDEElementWise>(
+            args,
+            ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat},
+            gemm_workspace.GetDeviceBuffer());
+    }
+    else
+    {
+        // NOTE: With the persistent TileLoop kernel, we do not necessarily need to have
+        // the gemm problems known on the host. Instead, we can just pass the pointer
+        // to the kernel and let the workgroups figure out which tiles to work on.
+        // This is useful when the gemm problems are generated dynamically.
+        // In this example however, we generate the `kargs` using the known gemm_descs,
+        // and copy the gemm descriptions to the device memory.
+        // The contents of the memory pointed to by `kargs_ptr` pointer could be
+        // written by e.g. another kernel from earlier stage.
+        std::vector<ck_tile::GemmTransKernelArg> kargs;
+        void* kargs_ptr   = gemm_workspace.GetDeviceBuffer();
+        const bool splitk = args[0].k_batch > 1;
+        for(const auto& arg : args)
+        {
+            kargs.emplace_back(ck_tile::UniversalGemmKernelArgs<>{{arg.a_ptr},
+                                                                  {arg.b_ptr},
+                                                                  {/*arg.ds_ptr*/},
+                                                                  arg.e_ptr,
+                                                                  arg.M,
+                                                                  arg.N,
+                                                                  arg.K,
+                                                                  {arg.stride_A},
+                                                                  {arg.stride_B},
+                                                                  {/*arg.stride_Ds*/},
+                                                                  arg.stride_E,
+                                                                  arg.k_batch});
+        }
+        const auto stream = ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat};
+        HIP_CHECK_ERROR(hipMemcpyWithStream(kargs_ptr,
+                                            kargs.data(),
+                                            kargs.size() * sizeof(ck_tile::GemmTransKernelArg),
+                                            hipMemcpyHostToDevice,
+                                            stream.stream_id_));
+        ave_time = grouped_gemm_tileloop<GemmConfig,
+                                         ALayout,
+                                         BLayout,
+                                         CLayout,
+                                         ADataType,
+                                         BDataType,
+                                         AccDataType,
+                                         CDataType>(stream, group_count, kargs_ptr, splitk);
+    }

    std::string op_name{"Grouped Gemm"};

@@ -66,7 +135,15 @@ float invoke_gemm(int n_warmup,
    return ave_time;
 }

-template <typename ALayout, typename BLayout, typename CLayout>
+template <bool Persistent,
+          typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 int run_grouped_gemm_example_with_layouts(int argc,
                                          char* argv[],
                                          const ALayout a_layout                  = ALayout{},
@@ -87,6 +164,15 @@ int run_grouped_gemm_example_with_layouts(int argc,
    const int group_count = arg_parser.get_int("group_count");
    const int repeat      = arg_parser.get_int("repeat");
    const int warmup      = arg_parser.get_int("warmup");
+    const int kbatch      = arg_parser.get_int("kbatch");
+    bool validate         = arg_parser.get_bool("validate");
+
+    if(kbatch > 1 && validate && warmup + repeat > 1)
+    {
+        std::cout << "WARNING: Data validation enabled with SplitK and more than"
+                  << "1 warmup/repeat. Disabling validation." << std::endl;
+        validate = false;
+    }

    std::vector<ck_tile::index_t> Ms        = arg_parser.get_int_vec("Ms");
    std::vector<ck_tile::index_t> Ns        = arg_parser.get_int_vec("Ns");
@@ -102,7 +188,7 @@ int run_grouped_gemm_example_with_layouts(int argc,
        {
            Ms.push_back(256 + 256 * i);
            Ns.push_back(256 + 512 * i);
-            Ks.push_back(256 + 64 * i);
+            Ks.push_back(512 + 128 * i);

            stride_As.push_back(Ks[i]);
            stride_Bs.push_back(Ks[i]);
@@ -146,12 +232,12 @@ int run_grouped_gemm_example_with_layouts(int argc,
        c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
            ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], is_row_major(CLayout{}))));

-        std::cout << "gemm[" << i << "]"
-                  << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc
-                  << " c_m_n: " << c_m_n_tensors[i].mDesc << std::endl;
+        std::cout << "gemm[" << i << "]" << " a_m_k: " << a_m_k_tensors[i].mDesc
+                  << " b_k_n: " << b_k_n_tensors[i].mDesc << " c_m_n: " << c_m_n_tensors[i].mDesc
+                  << std::endl;

-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_tensors[i]);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_tensors[i]);
+        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k_tensors[i]);
+        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n_tensors[i]);

        a_m_k_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(
            a_m_k_tensors[i].get_element_space_size_in_bytes()));
@@ -169,13 +255,21 @@ int run_grouped_gemm_example_with_layouts(int argc,
        const void* p_b = b_k_n_dev_buf[i]->GetDeviceBuffer();
        void* p_c       = c_m_n_dev_buf[i]->GetDeviceBuffer();

-        // TODO Add support for kbatch > 1 in grouped gemm
-        static constexpr ck_tile::index_t k_batch = 1;
        gemm_descs.push_back(
-            {p_a, p_b, p_c, k_batch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
+            {p_a, p_b, p_c, kbatch, M, N, K, stride_As[i], stride_Bs[i], stride_Cs[i]});
    }

-    invoke_gemm<ALayout, BLayout, CLayout>(warmup, repeat, group_count, gemm_descs);
+    invoke_gemm<GemmConfig,
+                ADataType,
+                BDataType,
+                ck_tile::tuple<>,
+                AccDataType,
+                CDataType,
+                ALayout,
+                BLayout,
+                ck_tile::tuple<>,
+                CLayout,
+                Persistent>(warmup, repeat, group_count, gemm_descs);

    for(int i = 0; i < group_count; i++)
    {
@@ -183,7 +277,7 @@ int run_grouped_gemm_example_with_layouts(int argc,
    }

    bool pass{true};
-    if(arg_parser.get_int("validate"))
+    if(validate)
    {
        for(int i = 0; i < group_count; ++i)
        {
@@ -194,7 +288,9 @@ int run_grouped_gemm_example_with_layouts(int argc,
                a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
            const float max_accumulated_value =
                *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-            const auto rtol_atol = calculate_rtol_atol(Ks[i], 1 /*kbatch*/, max_accumulated_value);
+            const auto rtol_atol =
+                calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+                    Ks[i], kbatch, max_accumulated_value);
            pass &= ck_tile::check_err(c_m_n_tensors[i],
                                       c_m_n_host_ref,
                                       "Error: Incorrect results!",
@@ -211,6 +307,61 @@ int run_grouped_gemm_example_with_layouts(int argc,
    return pass;
 }

+template <bool Persistent, typename GemmConfig, typename PrecType>
+int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
+{
+    using Row   = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col   = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using Types = GemmTypeConfig<PrecType>;
+    // Specific type aliases for easy access
+    using ADataType   = typename Types::ADataType;
+    using BDataType   = typename Types::BDataType;
+    using AccDataType = typename Types::AccDataType;
+    using CDataType   = typename Types::CDataType;
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Col{}, Row{});
+    }
+    else if(a_layout == "R" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Row{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "R")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Row{}, Row{});
+    }
+    else if(a_layout == "C" && b_layout == "C")
+    {
+        return run_grouped_gemm_example_with_layouts<Persistent,
+                                                     GemmConfig,
+                                                     ADataType,
+                                                     BDataType,
+                                                     CDataType,
+                                                     AccDataType>(argc, argv, Col{}, Col{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+}
+
+template <bool Persistent, template <typename PrecType> typename GemmConfig>
 int run_grouped_gemm_example(int argc, char* argv[])
 {
    auto [result, arg_parser] = create_args(argc, argv);
@@ -219,22 +370,22 @@ int run_grouped_gemm_example(int argc, char* argv[])
        return -1;
    }

-    const std::string a_layout = arg_parser.get_str("a_layout");
-    const std::string b_layout = arg_parser.get_str("b_layout");
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string data_type = arg_parser.get_str("prec");

-    using Row = ck_tile::tensor_layout::gemm::RowMajor;
-    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
-
-    if(a_layout == "R" && b_layout == "C")
+    if(data_type == "fp16")
    {
-        return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Col{}, Row{});
+        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, argc, argv);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<Persistent, GemmConfig<ck_tile::fp8_t>, ck_tile::fp8_t>(
+            a_layout, b_layout, argc, argv);
    }
-    // else if(a_layout == "R" && b_layout == "R")
-    // {
-    //     return run_grouped_gemm_example_with_layouts(argc, argv, Row{}, Row{}, Row{});
-    // }
    else
    {
-        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+        throw std::runtime_error("Unsupported data type configuration.");
    }
 }
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
+
+set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
+# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+# list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter)
+target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
--- a/example/ck_tile/18_flatmm/README.md
+++ b/example/ck_tile/18_flatmm/README.md
@@ -0,0 +1,35 @@
+# FLATMM Matrix Multiplication
+
+This folder contains example for FLATMM using ck_tile tile-programming implementation. Currently, it only supports the basic feature of the CK Tile FLATMM, but creates the placeholders for the future support on different FLATMM pipeline and different FLATMM modules. In the near future, we will gradually migrate all the FLATMM features from old CK to CK Tile.
+
+## build
+```
+# in the root of ck_tile
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the flatmm calculation
+make tile_example_flatmm_basic -j
+```
+This will result in an executable `build/bin/tile_example_flatmm_basic`
+
+## example
+```
+args:
+          -b    batch size (default:1)
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
+          -k    k dimension (default:64)
+   -a_layout    Tensor A data layout (default: R)
+   -b_layout    Tensor B data layout (default: R)
+   -c_layout    Tensor C data layout (default: R)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+          -e    Absolute error tolerance (default:1e-5)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:10)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+```
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "flatmm_basic.hpp"
+#include "run_flatmm_example.inc"
+
+template <typename FlatmmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDatatype,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool persistent,
+          typename CDEElementWise>
+float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_config& s)
+{
+    using CodegenFlatmmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<FlatmmConfig::M_Tile, FlatmmConfig::N_Tile, FlatmmConfig::K_Tile>,
+        ck_tile::sequence<FlatmmConfig::M_Warp, FlatmmConfig::N_Warp, FlatmmConfig::K_Warp>,
+        ck_tile::sequence<FlatmmConfig::M_Warp_Tile,
+                          FlatmmConfig::N_Warp_Tile,
+                          FlatmmConfig::K_Warp_Tile>>;
+
+    using TilePartitioner =
+        ck_tile::GemmSpatiallyLocalTilePartitioner<CodegenFlatmmShape,
+                                                   FlatmmConfig::TileParitionerGroupNum,
+                                                   FlatmmConfig::TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<FlatmmConfig::kPadM,
+                                           FlatmmConfig::kPadN,
+                                           FlatmmConfig::kPadK,
+                                           ALayout,
+                                           BLayout,
+                                           ELayout,
+                                           FlatmmConfig::NumWaveGroups>;
+
+    using CodegenGemmTraits = ck_tile::TileGemmUniversalTraits<FlatmmConfig::kPadM,
+                                                               FlatmmConfig::kPadN,
+                                                               FlatmmConfig::kPadK,
+                                                               FlatmmConfig::DoubleSmemBuffer,
+                                                               ALayout,
+                                                               BLayout,
+                                                               ELayout,
+                                                               FlatmmConfig::TransposeC,
+                                                               FlatmmConfig::UseStructuredSparsity,
+                                                               persistent,
+                                                               FlatmmConfig::NumWaveGroups,
+                                                               true>;
+
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, CodegenFlatmmShape, Traits>;
+
+    using BaseGemmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * FlatmmConfig::K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * FlatmmConfig::K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+    float ave_time{0};
+
+    const auto Run = [&](const auto has_hot_loop_,
+                         const auto tail_number_,
+                         const auto memory_operation_) {
+        constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+        constexpr auto tail_number_v    = tail_number_.value;
+        constexpr auto scheduler        = FlatmmConfig::Scheduler;
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using CodegenPipelineProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                             BDataType,
+                                                                             AccDataType,
+                                                                             CodegenFlatmmShape,
+                                                                             CodegenGemmTraits,
+                                                                             scheduler,
+                                                                             has_hot_loop_v,
+                                                                             tail_number_v>;
+
+        using CodegenFlatmmPipeline =
+            ck_tile::FlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDatatype,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             FlatmmConfig::M_Warp,
+                                             FlatmmConfig::N_Warp,
+                                             FlatmmConfig::M_Warp_Tile,
+                                             FlatmmConfig::N_Warp_Tile,
+                                             FlatmmConfig::K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             FlatmmConfig::NumWaveGroups>>;
+
+        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+        using Kernel = ck_tile::FlatmmKernel<TilePartitioner, CodegenFlatmmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args:" << CodegenFlatmmShape::GetName() << "\n"
+                      << "Shape: " << CodegenFlatmmShape::GetName() << "\n"
+                      << "problem: " << CodegenPipelineProblem::GetName() << "\n"
+                      << "pipeline: " << CodegenFlatmmPipeline::GetName() << "\n"
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+            static constexpr ck_tile::index_t APackedSize =
+                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+            static constexpr ck_tile::index_t BPackedSize =
+                std::is_same_v<BDataType, ck_tile::pk_int4_t> ? 2 : 1;
+
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
+                    Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time =
+                ck_tile::launch_kernel(s,
+                                       ck_tile::make_kernel<blocks.x, FlatmmConfig::kBlockPerCu>(
+                                           Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    return ave_time;
+}
+
+template <template <typename PreType> typename FlatmmConfig>
+int run_flatmm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    int k        = arg_parser.get_int("k");
+    int stride_b = arg_parser.get_int("stride_b");
+
+    if(b_layout == "C" && stride_b > k)
+    {
+        throw std::runtime_error(
+            "For ColumnMajor layout, StrideB must be smaller than or equal to K (" +
+            std::to_string(k) + ")");
+    }
+
+    if(a_layout == "R" && b_layout == "C")
+    {
+
+        if(data_type == "fp16")
+        {
+            run_flatmm_example_with_layouts<ck_tile::half_t, FlatmmConfig<ck_tile::half_t>>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(data_type == "bf16")
+        {
+            run_flatmm_example_with_layouts<ck_tile::bf16_t, FlatmmConfig<ck_tile::bf16_t>>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(data_type == "fp8")
+        {
+            run_flatmm_example_with_layouts<ck_tile::fp8_t, FlatmmConfig<ck_tile::fp8_t>>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else if(data_type == "bf8")
+        {
+            run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>>(
+                argc, argv, Row{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported data_type!");
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
+    }
+    return -1;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return EXIT_FAILURE;
+
+    try
+    {
+        int warp_tile = arg_parser.get_int("warp_tile");
+        if(warp_tile == 0)
+        {
+            return !run_flatmm_example<FlatmmConfig16>(argc, argv);
+        }
+        else if(warp_tile == 1)
+        {
+            return !run_flatmm_example<FlatmmConfig32>(argc, argv);
+        }
+        else if(warp_tile == 2)
+        {
+            return !run_flatmm_example<FlatmmConfig16_950>(argc, argv);
+        }
+        else
+        {
+            return !run_flatmm_example<FlatmmConfig32_950>(argc, argv);
+        }
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/18_flatmm/flatmm_basic.hpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
@@ -0,0 +1,202 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/flatmm.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+// GEMM config with 32x132 warp tile
+template <typename DataType>
+struct FlatmmConfig32
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(DataType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 16 : 32;
+
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                = 2;
+    static constexpr int TileParitionerGroupNum     = 8;
+    static constexpr int TileParitionerM01          = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool DoubleSmemBuffer          = false;
+};
+
+template <typename DataType>
+struct FlatmmConfig32_950 : public FlatmmConfig32<DataType>
+{
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 16 : 64;
+};
+
+// GEMM config with 16x16 warp tile
+template <typename DataType>
+struct FlatmmConfig16
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(DataType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 32 : 64;
+
+    static constexpr bool kPadM = false;
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                = 2;
+    static constexpr int TileParitionerGroupNum     = 8;
+    static constexpr int TileParitionerM01          = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool DoubleSmemBuffer          = false;
+};
+
+template <typename DataType>
+struct FlatmmConfig16_950 : public FlatmmConfig16<DataType>
+{
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 32 : 128;
+};
+
+template <typename ADataType>
+struct GemmBasicTypeConfig;
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::bf16_t>
+{
+    using ADataType   = ck_tile::bf16_t;
+    using BDataType   = ck_tile::bf16_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::bf16_t;
+};
+template <>
+struct GemmBasicTypeConfig<ck_tile::fp8_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::fp8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+
+template <>
+struct GemmBasicTypeConfig<ck_tile::bf8_t>
+{
+    using ADataType   = ck_tile::bf8_t;
+    using BDataType   = ck_tile::bf8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <typename T>
+struct DataTypeTraits;
+
+template <>
+struct DataTypeTraits<ck_tile::fp8_t>
+{
+    static constexpr const char* name = "fp8";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::bf8_t>
+{
+    static constexpr const char* name = "bf8";
+};
+template <>
+struct DataTypeTraits<float>
+{
+    static constexpr const char* name = "fp32";
+};
+
+template <>
+struct DataTypeTraits<double>
+{
+    static constexpr const char* name = "fp64";
+};
+
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr const char* name = "fp16";
+};
+
+template <typename T>
+struct is_8bit_type
+    : std::bool_constant<std::is_same_v<T, ck_tile::fp8_t> || std::is_same_v<T, ck_tile::bf8_t>>
+{
+};
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "256", "m dimension")
+        .insert("n", "256", "n dimension")
+        .insert("k", "128", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "C", "B tensor data layout - Row by default")
+        .insert("c_layout", "R", "C tensor data layout - Row by default")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_c", "0", "Tensor C stride")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("warp_tile",
+                "0",
+                "0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only)");
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename FlatmmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <type_traits>
+
+template <typename T>
+constexpr const char* DataTypeToString()
+{
+    if constexpr(std::is_same_v<T, ck_tile::half_t>)
+    {
+        return "fp16";
+    }
+    else if constexpr(std::is_same_v<T, ck_tile::fp8_t>)
+    {
+        return "fp8";
+    }
+    else if constexpr(std::is_same_v<T, ck_tile::bf8_t>)
+    {
+        return "bf8";
+    }
+    else if constexpr(std::is_same_v<T, ck_tile::bf16_t>)
+    {
+        return "bf16";
+    }
+    else
+    {
+        return "unknown";
+    }
+}
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+// mfma_type, 0:32x32, 1:16x16
+template <typename FlatmmConfig, typename T>
+auto shuffle_b(const ck_tile::HostTensor<T>& t)
+{
+    assert(t.get_lengths().size() == 2);
+    int n_                = t.get_lengths()[1];
+    int k_                = t.get_lengths()[0];
+    constexpr int divisor = FlatmmConfig::N_Warp_Tile == 32 ? 2 : 4;
+    ck_tile::HostTensor<T> t_view({n_ / FlatmmConfig::N_Warp_Tile,
+                                   FlatmmConfig::N_Warp_Tile,
+                                   k_ / FlatmmConfig::K_Warp_Tile,
+                                   divisor,
+                                   FlatmmConfig::K_Warp_Tile / divisor});
+    std::copy(t.begin(), t.end(), t_view.begin());
+    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+template <typename FlatmmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDatatype,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool persistent,
+          typename CDEElementWise>
+float flatmm_calc(const ck_tile::FlatmmHostArgs<>& args, const ck_tile::stream_config& s);
+
+template <typename FlatmmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDatatype,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
+                    ck_tile::DeviceMem& b_shuffle_dev_buf,
+                    ck_tile::DeviceMem& c_dev_buf,
+                    ck_tile::index_t M,
+                    ck_tile::index_t N,
+                    ck_tile::index_t K,
+                    ck_tile::index_t stride_A,
+                    ck_tile::index_t stride_B,
+                    ck_tile::index_t stride_C,
+                    ck_tile::index_t kbatch,
+                    int n_warmup,
+                    int n_repeat)
+{
+    ck_tile::FlatmmHostArgs<> args = {a_dev_buf.GetDeviceBuffer(),
+                                      b_shuffle_dev_buf.GetDeviceBuffer(),
+                                      {},
+                                      c_dev_buf.GetDeviceBuffer(),
+                                      kbatch,
+                                      M,
+                                      N,
+                                      K,
+                                      stride_A,
+                                      stride_B,
+                                      {},
+                                      stride_C};
+
+    float ave_time = flatmm_calc<FlatmmConfig,
+                                 ADataType,
+                                 BDataType,
+                                 DsDatatype,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 DsLayout,
+                                 CLayout,
+                                 false,
+                                 CDEElementWise>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Flatmm kernel with DataType = " << DataTypeToString<ADataType>()
+              << " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A
+              << " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time
+              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+
+    return ave_time;
+}
+
+template <typename PrecType,
+          typename FlatmmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_flatmm_example_with_layouts(int argc,
+                                    char* argv[],
+                                    const ALayout a_layout                  = ALayout{},
+                                    const BLayout b_layout                  = BLayout{},
+                                    [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using ADataType   = typename GemmBasicTypeConfig<PrecType>::ADataType;
+    using BDataType   = typename GemmBasicTypeConfig<PrecType>::BDataType;
+    using CDataType   = typename GemmBasicTypeConfig<PrecType>::CDataType;
+    using AccDataType = typename GemmBasicTypeConfig<PrecType>::AccDataType;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    // persistent not added
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_host(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_origin_host(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_rslt_host(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    // TODO: add different init types
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_host);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_origin_host);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
+    }
+    else
+    {
+        a_host.SetZero();
+        b_origin_host.SetZero();
+    }
+
+    ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_dev_buf(c_rslt_host.get_element_space_size_in_bytes());
+
+    a_dev_buf.ToDevice(a_host.data());
+    c_rslt_host.SetZero();
+
+    // do pre-shuffle
+    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<FlatmmConfig>(b_origin_host);
+    ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
+    b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());
+
+    invoke_flatmm<FlatmmConfig,
+                  ADataType,
+                  BDataType,
+                  ck_tile::tuple<>,
+                  AccDataType,
+                  CDataType,
+                  ALayout,
+                  BLayout,
+                  ck_tile::tuple<>,
+                  CLayout>(a_dev_buf,
+                           b_shuffle_dev_buf,
+                           c_dev_buf,
+                           M,
+                           N,
+                           K,
+                           stride_A,
+                           stride_B,
+                           stride_C,
+                           kbatch,
+                           n_warmup,
+                           n_repeat);
+
+    c_dev_buf.FromDevice(c_rslt_host.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_ref_host(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_ref_host.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_host, b_origin_host, c_ref_host);
+        const float max_accumulated_value =
+            *std::max_element(c_ref_host.mData.begin(), c_ref_host.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_rslt_host,
+                                  c_ref_host,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU veification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        ck_tile::DeviceMem b_origin_dev_buf(b_origin_host.get_element_space_size_in_bytes());
+        b_origin_dev_buf.ToDevice(b_origin_host.data());
+
+        ck_tile::HostTensor<CDataType> c_gpu_ref_host(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        ck_tile::DeviceMem c_gpu_ref_dev_buf(c_gpu_ref_host.get_element_space_size_in_bytes());
+        c_gpu_ref_host.SetZero();
+        c_gpu_ref_dev_buf.SetZero();
+
+        ADataType* d_A;
+        BDataType* d_B;
+        CDataType* d_C;
+
+        ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType)));
+
+        ck_tile::hip_check_error(hipMemcpy(
+            d_A, a_dev_buf.GetDeviceBuffer(), M * K * sizeof(ADataType), hipMemcpyHostToDevice));
+        ck_tile::hip_check_error(hipMemcpy(d_B,
+                                           b_origin_dev_buf.GetDeviceBuffer(),
+                                           N * K * sizeof(BDataType),
+                                           hipMemcpyHostToDevice));
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        ck_tile::hip_check_error(hipMemcpy(c_gpu_ref_dev_buf.GetDeviceBuffer(),
+                                           d_C,
+                                           M * N * sizeof(CDataType),
+                                           hipMemcpyDeviceToHost));
+
+        ck_tile::hip_check_error(hipFree(d_A));
+        ck_tile::hip_check_error(hipFree(d_B));
+        ck_tile::hip_check_error(hipFree(d_C));
+
+        c_gpu_ref_dev_buf.FromDevice(c_gpu_ref_host.data());
+        const float max_accumulated_value =
+            *std::max_element(c_gpu_ref_host.mData.begin(), c_gpu_ref_host.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_rslt_host,
+                                  c_gpu_ref_host,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass;
+}
--- a/example/ck_tile/18_flatmm/script/smoke_test_basic.sh
+++ b/example/ck_tile/18_flatmm/script/smoke_test_basic.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier:  MIT
+
+EXE="$(find . -name tile_example_flatmm_basic -type f | head -n 1)"
+KNAME=1
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=2 -warmup=0 -repeat=1'
+
+run_tests() {
+    for m in 128 1024; do
+        for n in 128 2048; do
+            for k in 128 4096; do
+
+                $EXE -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -prec=$1 $COMMON_ARGS
+                if [ $? -eq 0 ]; then
+                    echo "Success: Test with m=$m, n=$n, k=$k executed successfully."
+                else
+                    echo "Error: Test with m=$m, n=$n, k=$k failed to execute properly."
+                    # Optionally, exit or break if you need to halt further execution
+                    # exit 1
+                fi
+
+            done
+        done
+    done
+}
+
+set -x
+
+run_tests "bf16"
+run_tests "fp16"
+
+set +x
--- a/example/ck_tile/19_gemm_multi_d/CMakeLists.txt
+++ b/example/ck_tile/19_gemm_multi_d/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_executable(tile_example_gemm_multi_d_fp16 EXCLUDE_FROM_ALL gemm_multi_d_fp16.cpp)
+set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+target_compile_options(tile_example_gemm_multi_d_fp16 PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/19_gemm_multi_d/README.md
+++ b/example/ck_tile/19_gemm_multi_d/README.md
@@ -0,0 +1,35 @@
+#Multiple D GEMM
+
+This folder contains example for Multiple D GEMM using ck_tile tile-programming implementation.
+
+## build
+```
+#in the root of ck_tile
+mkdir build && cd build
+#you can replace < arch> with the appropriate architecture(for example gfx90a or gfx942) or \
+    leave it blank
+sh ../script/cmake-ck-dev.sh  ../ <arch>
+#The basic pipeline method on the gemm calculation
+make tile_example_gemm_multi_d_fp16 -j
+```
+This will result in an executable `build/bin/tile_example_gemm_multi_d_fp16`
+
+## example
+```
+args:
+       -m  M dimensions - (Default: 3840)
+       -n  N dimensions - (Default: 4096)
+       -k  K dimensions - (Default: 4096)
+-a_layout  Tensor A layout (default:R)
+-b_layout  Tensor B layout (default:C)
+-ds_layout Tensor D layout (default:R)
+-e_layout  Tensor E layout (default:R)
+-stride_a  Tensor A strides - (Default: 0)
+-stride_b  Tensor B strides - (Default: 0)
+-stride_e  Tensor C strides - (Default: 0)
+-stride_ds Tensor D strides - (Default: 0)
+-validate  0. No validation, 1. Validation on GPU. (Default: 1)
+  -warmup  Number of iterations before benchmark the kernel. (Default: 10)
+  -repeat  Number of iterations to benchmark the kernel. (Default: 100)
+  -kbatch  kbatch for SplitK. (Default 1)
+```
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.cpp
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <memory>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/host.hpp"
+#include "gemm_multi_d_fp16.hpp"
+#include "utils.hpp"
+
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+auto gemm_multi_d(const gemm_multi_d_kargs& args, const ck_tile::stream_config& s) -> float
+{
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+    // Memory friendly for Interwave scheduler
+    constexpr ck_tile::index_t M_Tile = 128;
+    constexpr ck_tile::index_t N_Tile = 32;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 4;
+    constexpr ck_tile::index_t N_Warp = 1;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 8;
+
+    constexpr bool DoubleSmemBuffer = false;
+#endif
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+    // Compute friendly for Intrawave scheduler
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr bool DoubleSmemBuffer = false;
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+    // Compute friendly for Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    constexpr ck_tile::index_t M_Tile = 256;
+    constexpr ck_tile::index_t N_Tile = 256;
+    constexpr ck_tile::index_t K_Tile = 32;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr bool DoubleSmemBuffer = true;
+#endif
+
+    constexpr bool kPadM = false;
+    constexpr bool kPadN = false;
+    constexpr bool kPadK = false;
+
+    constexpr bool TransposeC = false;
+
+    constexpr int kBlockPerCu                         = 1;
+    constexpr ck_tile::index_t TileParitionerGroupNum = 8;
+    constexpr ck_tile::index_t TileParitionerM01      = 4;
+
+    using GemmShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    using TilePartitioner = ck_tile::
+        GemmSpatiallyLocalTilePartitioner<GemmShape, TileParitionerGroupNum, TileParitionerM01>;
+
+    using Traits = ck_tile::TileGemmTraits<kPadM, kPadN, kPadK, ALayout, BLayout, CLayout>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<kPadM,
+                                                                 kPadN,
+                                                                 kPadK,
+                                                                 DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 TransposeC>;
+    using GemmPipelineProblem =
+        ck_tile::GemmPipelineProblem<ADataType, BDataType, AccDataType, GemmShape, Traits>;
+
+    using BaseGemmPipeline = UNIVERSAL_GEMM_PIPELINE<GemmPipelineProblem>;
+
+    const ck_tile::index_t k_grain     = args.k_batch * K_Tile;
+    const ck_tile::index_t K_split     = (args.K + k_grain - 1) / k_grain * K_Tile;
+    const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
+    const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
+    const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
+
+    float ave_time{0};
+
+    const auto Run =
+        [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = GEMM_PIPELINE_SCHEDULER;
+            constexpr auto memory_operation = memory_operation_.value;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                               BDataType,
+                                                                               AccDataType,
+                                                                               GemmShape,
+                                                                               GemmUniversalTraits,
+                                                                               scheduler,
+                                                                               has_hot_loop_v,
+                                                                               tail_number_v>;
+
+            using GemmPipeline = GEMM_PIPELINE<UniversalGemmProblem>;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                 BDataType,
+                                                 DsDataType,
+                                                 AccDataType,
+                                                 EDataType,
+                                                 DsLayout,
+                                                 CLayout,
+                                                 CDEElementWise,
+                                                 GemmPipelineProblem::kBlockSize,
+                                                 TilePartitioner::MPerBlock,
+                                                 TilePartitioner::NPerBlock,
+                                                 M_Warp,
+                                                 N_Warp,
+                                                 M_Warp_Tile,
+                                                 N_Warp_Tile,
+                                                 K_Warp_Tile,
+                                                 UniversalGemmProblem::TransposeC,
+                                                 memory_operation>>;
+
+            using Kernel = ck_tile::GemmKernelMultiD<TilePartitioner, GemmPipeline, GemmEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
+
+            const dim3 grids      = Kernel::GridSize(args.M, args.N, args.k_batch);
+            constexpr dim3 blocks = Kernel::BlockSize();
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+            }
+
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args:" << " grid: {" << grids.x << ", "
+                          << grids.y << ", " << grids.z << "}" << ", blocks: {" << blocks.x << ", "
+                          << blocks.y << ", " << blocks.z << "}" << std::endl;
+            }
+
+            ave_time = ck_tile::launch_kernel(
+                s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+            return ave_time;
+        };
+
+    const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+        if(args.k_batch == 1)
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::set>{});
+        }
+        else
+        {
+            Run(has_hot_loop_,
+                tail_number_,
+                ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                           ck_tile::memory_operation_enum::atomic_add>{});
+        }
+    };
+
+    BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+
+    return ave_time;
+}
+
+#include "run_gemm_multi_d_fp16_example.inc"
+
+int main(int argc, char* argv[]) { return !run_multiple_d_gemm_example(argc, argv); }
--- a/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
+++ b/example/ck_tile/19_gemm_multi_d/gemm_multi_d_fp16.hpp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
+
+#define CK_TILE_PIPELINE_COMPUTE_V3 1
+#define CK_TILE_PIPELINE_MEMORY 2
+#define CK_TILE_PIPELINE_COMPUTE_V4 3
+
+#ifndef CK_TILE_PIPELINE_DEFAULT
+#define CK_TILE_PIPELINE_DEFAULT CK_TILE_PIPELINE_COMPUTE_V3
+#endif
+
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_MEMORY)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrMem
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrMem
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Interwave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV3
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV3
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#elif(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V4)
+#define GEMM_PIPELINE ck_tile::GemmPipelineAgBgCrCompV4
+#define UNIVERSAL_GEMM_PIPELINE ck_tile::BaseGemmPipelineAgBgCrCompV4
+#define GEMM_PIPELINE_SCHEDULER ck_tile::GemmPipelineScheduler::Intrawave
+#else
+#error "unsupported CK_TILE_PIPELINE_DEFAULT value"
+#endif
+
+using ADataType   = ck_tile::half_t;
+using BDataType   = ck_tile::half_t;
+using D0DataType  = ck_tile::half_t;
+using D1DataType  = ck_tile::half_t;
+using EDataType   = ck_tile::half_t;
+using DsDataType  = ck_tile::tuple<D0DataType, D1DataType>;
+using AccDataType = float;
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3840", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("k", "4096", "k dimension")
+        .insert("a_layout", "R", "A tensor data layout - Row by default")
+        .insert("b_layout", "C", "B tensor data layout - Col by default")
+        .insert("ds_layout", "R", "Ds tensor data layout - Row by default")
+        .insert("e_layout", "R", "E tensor data layout - Row by default")
+        .insert("stride_a", "0", "Tensor A stride")
+        .insert("stride_b", "0", "Tensor B stride")
+        .insert("stride_ds", "0", "Tensor Ds stride")
+        .insert("stride_e", "0", "Tensor E stride")
+        .insert("v", "1", "0. No validation, 1. Validation on GPU")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("kbatch", "1", "kbatch for SplitK");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+using gemm_multi_d_kargs = ck_tile::GemmMultiDHostArgs<DsDataType::size()>;
+
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise>
+float gemm_multi_d(const gemm_multi_d_kargs& kargs, const ck_tile::stream_config& s);
--- a/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc
+++ b/example/ck_tile/19_gemm_multi_d/run_gemm_multi_d_fp16_example.inc
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+#include <cstddef>
+
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename EDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm_multi_d(const void* a_m_k_dev_buf,
+                          const void* b_k_n_dev_buf,
+                          const std::array<const void*, DsDataType::size()>& ds_m_n_dev_buf,
+                          void* e_m_n_dev_buf,
+                          ck_tile::index_t M,
+                          ck_tile::index_t N,
+                          ck_tile::index_t K,
+                          ck_tile::index_t StrideA,
+                          ck_tile::index_t StrideB,
+                          const std::array<ck_tile::index_t, DsDataType::size()>& StrideDs,
+                          ck_tile::index_t StrideE,
+                          int n_warmup,
+                          int n_repeat,
+                          int k_batch)
+{
+    gemm_multi_d_kargs gemm_descs({a_m_k_dev_buf,
+                                   b_k_n_dev_buf,
+                                   ds_m_n_dev_buf,
+                                   e_m_n_dev_buf,
+                                   k_batch,
+                                   M,
+                                   N,
+                                   K,
+                                   StrideA,
+                                   StrideB,
+                                   StrideDs,
+                                   StrideE});
+
+    float ave_time = gemm_multi_d<ADataType,
+                                  BDataType,
+                                  DsDataType,
+                                  AccDataType,
+                                  EDataType,
+                                  ALayout,
+                                  BLayout,
+                                  DsLayout,
+                                  ELayout,
+                                  CDEElementWise>(
+        gemm_descs, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::string op_name{"Gemm Multiple-D"};
+    static constexpr ck_tile::index_t NumDTensor = DsDataType::size();
+
+    std::size_t flop = 0, num_btype = 0;
+
+    flop += std::size_t(2) * M * N * K;
+
+    ck_tile::static_for<0, NumDTensor, 1>{}([&](auto i) {
+        num_btype += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
+        flop += sizeof(ck_tile::remove_cvref_t<std::tuple_element_t<i, DsDataType>>) * M * N;
+    });
+
+    num_btype += sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;
+
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm Multiple-D kernel with:\n";
+    std::cout << "M =" << M << " N =" << N << " K =" << K << "\n";
+    std::cout << "StrideA = " << StrideA << " StrideB = " << StrideB << " StrideE = " << StrideE
+              << "\n";
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << "\n";
+
+    return ave_time;
+}
+
+template <typename ALayout,
+          typename BLayout,
+          typename D0Layout,
+          typename D1Layout,
+          typename ELayout>
+int run_multiple_d_gemm_example_with_layouts(int argc,
+                                             char* argv[],
+                                             const ALayout a_layout   = ALayout{},
+                                             const BLayout b_layout   = BLayout{},
+                                             const D0Layout d0_layout = D0Layout{},
+                                             const D1Layout d1_layout = D1Layout{},
+                                             const ELayout e_layout   = ELayout{})
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+    using CDElementWiseFn = MultiplyMultiply;
+    using DsLayout        = ck_tile::tuple<D0Layout, D1Layout>;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t StrideA = arg_parser.get_int("stride_a");
+    ck_tile::index_t StrideB = arg_parser.get_int("stride_b");
+    ck_tile::index_t StrideD = arg_parser.get_int("stride_ds");
+    ck_tile::index_t StrideE = arg_parser.get_int("stride_e");
+
+    ck_tile::index_t StrideD0 = StrideD;
+    ck_tile::index_t StrideD1 = StrideD;
+
+    const int n_warmup = arg_parser.get_int("warmup");
+    const int n_repeat = arg_parser.get_int("repeat");
+    const int k_batch  = arg_parser.get_int("kbatch");
+
+    StrideA  = get_default_stride(M, K, StrideA, is_row_major(a_layout));
+    StrideB  = get_default_stride(K, N, StrideB, is_row_major(b_layout));
+    StrideD0 = get_default_stride(M, N, StrideD0, is_row_major(d0_layout));
+    StrideD1 = get_default_stride(M, N, StrideD1, is_row_major(d1_layout));
+    StrideE  = get_default_stride(M, N, StrideE, is_row_major(e_layout));
+
+    ck_tile::HostTensor<ADataType> a_m_k_tesnor(
+        host_tensor_descriptor(M, K, StrideA, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n_tensors(
+        host_tensor_descriptor(K, N, StrideB, is_row_major(b_layout)));
+    ck_tile::HostTensor<D0DataType> d0_m_n_tensors(
+        host_tensor_descriptor(M, N, StrideD0, is_row_major(d0_layout)));
+    ck_tile::HostTensor<D1DataType> d1_m_n_tensors(
+        host_tensor_descriptor(M, N, StrideD1, is_row_major(d1_layout)));
+    ck_tile::HostTensor<EDataType> e_m_n_device_result(
+        host_tensor_descriptor(M, N, StrideE, is_row_major(e_layout)));
+
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k_tesnor);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n_tensors);
+    ck_tile::FillUniformDistribution<D0DataType>{-1.f, 1.f}(d0_m_n_tensors);
+    ck_tile::FillUniformDistribution<D1DataType>{-1.f, 1.f}(d1_m_n_tensors);
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k_tesnor.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n_tensors.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem d0_m_n_dev_buf(d0_m_n_tensors.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem d1_m_n_dev_buf(d1_m_n_tensors.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem e_m_n_dev_buf(e_m_n_device_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k_tesnor.mData.data());
+    b_k_n_dev_buf.ToDevice(b_k_n_tensors.mData.data());
+    d0_m_n_dev_buf.ToDevice(d0_m_n_tensors.mData.data());
+    d1_m_n_dev_buf.ToDevice(d1_m_n_tensors.mData.data());
+
+    e_m_n_dev_buf.SetZero();
+    e_m_n_device_result.SetZero();
+
+    std::array<const void*, DsDataType::size()> ds_ptr_buf = {d0_m_n_dev_buf.GetDeviceBuffer(),
+                                                              d1_m_n_dev_buf.GetDeviceBuffer()};
+
+    std::array<ck_tile::index_t, DsDataType::size()> stridesDs = {StrideD0, StrideD1};
+
+    invoke_gemm_multi_d<ADataType,
+                        BDataType,
+                        DsDataType,
+                        AccDataType,
+                        EDataType,
+                        ALayout,
+                        BLayout,
+                        DsLayout,
+                        ELayout,
+                        CDElementWiseFn>(a_m_k_dev_buf.GetDeviceBuffer(),
+                                         b_k_n_dev_buf.GetDeviceBuffer(),
+                                         ds_ptr_buf,
+                                         e_m_n_dev_buf.GetDeviceBuffer(),
+                                         M,
+                                         N,
+                                         K,
+                                         StrideA,
+                                         StrideB,
+                                         stridesDs,
+                                         StrideE,
+                                         n_warmup,
+                                         n_repeat,
+                                         k_batch);
+
+    e_m_n_dev_buf.FromDevice(e_m_n_device_result.data());
+
+    ck_tile::HostTensor<EDataType> e_m_n_host_ref(
+        host_tensor_descriptor(M, N, StrideE, is_row_major(e_layout)));
+    e_m_n_host_ref.SetZero();
+
+    ck_tile::reference_gemm_multiple_d<ADataType,
+                                       BDataType,
+                                       DsDataType,
+                                       AccDataType,
+                                       EDataType,
+                                       CDElementWiseFn>(
+        a_m_k_tesnor, b_k_n_tensors, {d0_m_n_tensors, d1_m_n_tensors}, e_m_n_host_ref);
+
+    bool pass{true};
+    if(arg_parser.get_int("v"))
+    {
+        const float max_accumulated_value =
+            *std::max_element(e_m_n_host_ref.mData.begin(), e_m_n_host_ref.mData.end());
+
+        const auto rtol_atol = calculate_rtol_atol(K, 1, max_accumulated_value);
+
+        pass &= ck_tile::check_err(e_m_n_device_result,
+                                   e_m_n_host_ref,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}),
+                                   rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << std::endl;
+        std::cout << "Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+    return pass;
+}
+
+int run_multiple_d_gemm_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+    {
+        return -1;
+    }
+
+    const std::string a_layout  = arg_parser.get_str("a_layout");
+    const std::string b_layout  = arg_parser.get_str("b_layout");
+    const std::string ds_layout = arg_parser.get_str("ds_layout");
+
+    using Row = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
+
+    if(a_layout == "R" && b_layout == "C" && ds_layout == "R")
+    {
+        return run_multiple_d_gemm_example_with_layouts(
+            argc, argv, Row{}, Col{}, Row{}, Row{}, Row{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data layout configuration for provided tensors!");
+    }
+}
--- a/example/ck_tile/19_gemm_multi_d/utils.hpp
+++ b/example/ck_tile/19_gemm_multi_d/utils.hpp
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+struct MultiplyMultiply
+{
+    template <typename E, typename C, typename D0, typename D1>
+    CK_TILE_HOST_DEVICE auto operator()(E& e, const C& c, const D0& d0, const D1& d1) const -> void
+    {
+        const float x0_f = ck_tile::type_convert<float>(c) * ck_tile::type_convert<float>(d0) *
+                           ck_tile::type_convert<float>(d1);
+
+        e = ck_tile::type_convert<E>(x0_f);
+    }
+};
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeTypeAB =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+
+    using ComputeType =
+        std::conditional_t<sizeof(ComputeTypeAB) < sizeof(D0DataType), ComputeTypeAB, D0DataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, EDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, EDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch));
+
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<EDataType, EDataType, EDataType>(kbatch);
+
+    const auto atol_split_k = ck_tile::get_absolute_threshold<EDataType, EDataType, EDataType>(
+        max_accumulated_value, kbatch);
+
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(EXAMPLE_CONV_COMPILE_OPTIONS)
+list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+
+add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
+target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+
+add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
+target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight.cpp
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsDataType     = ck_tile::tuple<>,
+          typename DsLayout       = ck_tile::tuple<>,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
+                              const ck_tile::stream_config& s)
+{
+    constexpr int kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t M_Tile = 64;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr ck_tile::index_t VectorSizeA = 8;
+    constexpr ck_tile::index_t VectorSizeB = 8;
+    constexpr ck_tile::index_t VectorSizeC = 8;
+
+    // Implicit GEMM Traits
+    using CodegenShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
+    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<InDataType,
+                                     WeiDataType,
+                                     AccDataType,
+                                     CodegenShape,
+                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
+                                     InDataType,
+                                     true,
+                                     VectorSizeA,
+                                     VectorSizeB>;
+    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<InDataType,
+                                             WeiDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                             ck_tile::tensor_layout::gemm::RowMajor,
+                                             CDEElementWise,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             1,
+                                             true,
+                                             VectorSizeC>>;
+
+        using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
+                                                                       TilePartitioner,
+                                                                       CodegenPipeline,
+                                                                       ConvEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(kargs);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << '\n'
+                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel_time_mask(
+            s,
+            Kernel::Preprocess(kargs, s),
+            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+
+    if(args.k_batch == 1)
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::set>{});
+    }
+    else
+    {
+        return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                              ck_tile::memory_operation_enum::atomic_add>{});
+    }
+}
+
+#include "run_grouped_convolution_bwd_weight_example.inc"
+
+template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
+int run_grouped_conv_bwd_weight_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<1>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<2>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
+    {
+        return run_grouped_conv_bwd_weight_example_with_layouts<ck_tile::number<3>{},
+                                                                InPrecType,
+                                                                WeiPrecType,
+                                                                OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
+
+int run_grouped_conv_bwd_weight_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type  = arg_parser.get_str("prec");
+    std::string in_layout  = arg_parser.get_str("in_layout");
+    std::string wei_layout = arg_parser.get_str("wei_layout");
+    std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_grouped_conv_bwd_weight_example_prec_type<ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation!");
+    }
+}
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); }
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <ostream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "grouped_convolution_utils.hpp"
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename DsDataType     = ck_tile::tuple<>,
+          typename DsLayout       = ck_tile::tuple<>,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args, const ck_tile::stream_config& s)
+{
+    constexpr int kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t M_Tile = 64;
+    constexpr ck_tile::index_t N_Tile = 64;
+    constexpr ck_tile::index_t K_Tile = 64;
+
+    constexpr ck_tile::index_t M_Warp = 2;
+    constexpr ck_tile::index_t N_Warp = 2;
+    constexpr ck_tile::index_t K_Warp = 1;
+
+    constexpr ck_tile::index_t M_Warp_Tile = 32;
+    constexpr ck_tile::index_t N_Warp_Tile = 32;
+    constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    constexpr ck_tile::index_t VectorSizeA = 8;
+    constexpr ck_tile::index_t VectorSizeB = 8;
+    constexpr ck_tile::index_t VectorSizeC = 8;
+
+    // Implicit GEMM Traits
+    using CodegenShape =
+        ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                               ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                               ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+    constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
+    using TilePartitioner   = ck_tile::GemmTile1DPartitioner<CodegenShape>;
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<NDimSpatial, ConvSpec, InLayout, WeiLayout, DsLayout, OutLayout>;
+    using CodegenPipelineProblem =
+        ck_tile::GemmPipelineProblem<InDataType,
+                                     WeiDataType,
+                                     AccDataType,
+                                     CodegenShape,
+                                     typename GroupedConvTraitsType::GroupedConvImplicitGemmTraits,
+                                     InDataType,
+                                     true,
+                                     VectorSizeA,
+                                     VectorSizeB>;
+    using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+    const auto Run = [&](const auto memory_operation_) {
+        constexpr auto memory_operation = memory_operation_.value;
+
+        using ConvEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<InDataType,
+                                             WeiDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                             ck_tile::tensor_layout::gemm::RowMajor,
+                                             CDEElementWise,
+                                             CodegenPipelineProblem::kBlockSize,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC,
+                                             memory_operation,
+                                             1,
+                                             true,
+                                             VectorSizeC>>;
+
+        using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                TilePartitioner,
+                                                                CodegenPipeline,
+                                                                ConvEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids      = Kernel::GridSize(kargs);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << '\n'
+                      << "Vector size A: " << CodegenPipeline::GetVectorSizeA()
+                      << ", Vector size B: " << CodegenPipeline::GetVectorSizeB()
+                      << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+        }
+
+        float ave_time = ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        return ave_time;
+    };
+
+    return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
+                                          ck_tile::memory_operation_enum::set>{});
+}
+
+#include "run_grouped_convolution_fwd_example.inc"
+
+template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
+int run_grouped_conv_fwd_example_prec_type(
+    std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
+{
+    using NWGC   = ck_tile::tensor_layout::convolution::NWGC;
+    using NHWGC  = ck_tile::tensor_layout::convolution::NHWGC;
+    using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
+
+    using GKXC   = ck_tile::tensor_layout::convolution::GKXC;
+    using GKYXC  = ck_tile::tensor_layout::convolution::GKYXC;
+    using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
+
+    using NWGK   = ck_tile::tensor_layout::convolution::NWGK;
+    using NHWGK  = ck_tile::tensor_layout::convolution::NHWGK;
+    using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
+
+    if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NWGC{}, GKXC{}, NWGK{});
+    }
+    else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
+    }
+    else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC")
+    {
+        return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
+                                                         InPrecType,
+                                                         WeiPrecType,
+                                                         OutPrecType>(
+            argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported memory layout!");
+    }
+}
+
+int run_grouped_conv_fwd_example(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string data_type  = arg_parser.get_str("prec");
+    std::string in_layout  = arg_parser.get_str("in_layout");
+    std::string wei_layout = arg_parser.get_str("wei_layout");
+    std::string out_layout = arg_parser.get_str("out_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_grouped_conv_fwd_example_prec_type<ck_tile::half_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_grouped_conv_fwd_example_prec_type<ck_tile::bf16_t>(
+            in_layout, wei_layout, out_layout, argc, argv);
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+}
+
+int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -0,0 +1,131 @@
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/ops/grouped_convolution.hpp"
+
+template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
+auto calculate_rtol_atol(const ck_tile::index_t GemmK,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    using ComputeType =
+        std::conditional_t<sizeof(InDataType) < sizeof(WeiDataType), InDataType, WeiDataType>;
+    // Calculate thresholds
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, OutDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(GemmK, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, OutDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch));
+    // Calculate error due to split_k accumulation
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<OutDataType, OutDataType, OutDataType>(kbatch);
+    const auto atol_split_k =
+        ck_tile::get_absolute_threshold<OutDataType, OutDataType, OutDataType>(
+            max_accumulated_value, kbatch);
+    // Use higher threshold
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
+}
+
+ck_tile::index_t fill_spatial_dimensions(std::vector<ck_tile::index_t>& filter_spatial_lengths,
+                                         std::vector<ck_tile::index_t>& image_spatial_lengths,
+                                         std::vector<ck_tile::index_t>& strides,
+                                         std::vector<ck_tile::index_t>& dilations,
+                                         std::vector<ck_tile::index_t>& lpads,
+                                         std::vector<ck_tile::index_t>& rpads,
+                                         ck_tile::ArgParser& arg_parser)
+{
+
+    constexpr ck_tile::index_t non_sp_dims = 3;
+    const ck_tile::index_t n_dim_sp        = arg_parser.get_str("in_layout").size() - non_sp_dims;
+
+    if(!(n_dim_sp >= 1 && n_dim_sp <= 3))
+    {
+        throw std::runtime_error("Wrong layout!\n");
+    }
+
+    if(n_dim_sp == 3)
+    {
+        filter_spatial_lengths.push_back(arg_parser.get_int("z"));
+        image_spatial_lengths.push_back(arg_parser.get_int("d"));
+        strides.push_back(arg_parser.get_int("stride_d"));
+        dilations.push_back(arg_parser.get_int("dilation_d"));
+        lpads.push_back(arg_parser.get_int("lpad_d"));
+        rpads.push_back(arg_parser.get_int("rpad_d"));
+    }
+    if(n_dim_sp >= 2)
+    {
+        filter_spatial_lengths.push_back(arg_parser.get_int("y"));
+        image_spatial_lengths.push_back(arg_parser.get_int("h"));
+        strides.push_back(arg_parser.get_int("stride_h"));
+        dilations.push_back(arg_parser.get_int("dilation_h"));
+        lpads.push_back(arg_parser.get_int("lpad_h"));
+        rpads.push_back(arg_parser.get_int("rpad_h"));
+    }
+    filter_spatial_lengths.push_back(arg_parser.get_int("x"));
+    image_spatial_lengths.push_back(arg_parser.get_int("w"));
+    strides.push_back(arg_parser.get_int("stride_w"));
+    dilations.push_back(arg_parser.get_int("dilation_w"));
+    lpads.push_back(arg_parser.get_int("lpad_w"));
+    rpads.push_back(arg_parser.get_int("rpad_w"));
+
+    return n_dim_sp;
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("g", "2", "group dimension")
+        .insert("n", "32", "n dimension")
+        .insert("k", "32", "k dimension")
+        .insert("c", "32", "c dimension")
+
+        .insert("d", "64", "d dimension")
+        .insert("h", "64", "h dimension")
+        .insert("w", "64", "w dimension")
+
+        .insert("z", "4", "z dimension")
+        .insert("y", "4", "y dimension")
+        .insert("x", "4", "x dimension")
+
+        .insert("stride_d", "1", "d stride")
+        .insert("stride_h", "1", "h stride")
+        .insert("stride_w", "1", "w stride")
+
+        .insert("dilation_d", "1", "d dilation")
+        .insert("dilation_h", "1", "h dilation")
+        .insert("dilation_w", "1", "w dilation")
+
+        .insert("lpad_d", "0", "left pad for d dimension")
+        .insert("lpad_h", "0", "left pad for h dimension")
+        .insert("lpad_w", "0", "left pad for w dimension")
+
+        .insert("rpad_d", "0", "right pad for d dimension")
+        .insert("rpad_h", "0", "right pad for h dimension")
+        .insert("rpad_w", "0", "right pad for w dimension")
+
+        .insert("in_layout", "NHWGC", "Input image layout - NHWGC by default")
+        .insert("wei_layout", "GKYXC", "Weight layout - GKYXC by default")
+        .insert("out_layout", "NHWGK", "Output image layout - NHWGK by default")
+        .insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
+        .insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("split_k", "1", "splitK value")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// host API
+float grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                       const ck_tile::stream_config& s);
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_weight_example.inc
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+float invoke_grouped_conv_bwd_weight(ck_tile::GroupedConvBwdWeightHostArgs& args,
+                                     int n_warmup,
+                                     int n_repeat)
+{
+    float ave_time = grouped_conv_bwd_weight<NDimSpatial,
+                                             InDataType,
+                                             WeiDataType,
+                                             AccDataType,
+                                             OutDataType,
+                                             InLayout,
+                                             WeiLayout,
+                                             OutLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType = InDataType,
+          typename OutDataType = InDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+int run_grouped_conv_bwd_weight_example_with_layouts(
+    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using AccDataType = float;
+
+    std::vector<ck_tile::index_t> filter_spatial_lengths;
+    std::vector<ck_tile::index_t> image_spatial_lengths;
+    std::vector<ck_tile::index_t> strides;
+    std::vector<ck_tile::index_t> dilations;
+    std::vector<ck_tile::index_t> lpads;
+    std::vector<ck_tile::index_t> rpads;
+
+    const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
+                                                                image_spatial_lengths,
+                                                                strides,
+                                                                dilations,
+                                                                lpads,
+                                                                rpads,
+                                                                arg_parser);
+
+    ck_tile::conv::ConvParam conv_param{num_dim_sp,
+                                        arg_parser.get_int("g"),
+                                        arg_parser.get_int("n"),
+                                        arg_parser.get_int("k"),
+                                        arg_parser.get_int("c"),
+                                        filter_spatial_lengths,
+                                        image_spatial_lengths,
+                                        strides,
+                                        dilations,
+                                        lpads,
+                                        rpads};
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+
+    const auto in_g_n_c_wis_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
+    ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{-1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<OutDataType>{-1.f, 1.f}(output);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<InDataType>{}(input);
+        ck_tile::FillMonotonicSeq<OutDataType>{}(output);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<OutDataType>{1.f, 1.f}(output);
+    }
+    else
+    {
+        input.SetZero();
+        output.SetZero();
+    }
+
+    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+    input_dev_buf.ToDevice(input.data());
+    weight_dev_buf.SetZero();
+    output_dev_buf.ToDevice(output.data());
+
+    ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
+                                               input_dev_buf.GetDeviceBuffer(),
+                                               weight_dev_buf.GetDeviceBuffer(),
+                                               {},
+                                               output_dev_buf.GetDeviceBuffer(),
+                                               kbatch);
+
+    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << output.mDesc << std::endl;
+
+    invoke_grouped_conv_bwd_weight<NDimSpatial,
+                                   InDataType,
+                                   WeiDataType,
+                                   AccDataType,
+                                   OutDataType,
+                                   InLayout,
+                                   WeiLayout,
+                                   OutLayout>(args, n_warmup, n_repeat);
+
+    weight_dev_buf.FromDevice(weight.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
+        weight_host_ref.SetZero();
+
+        ck_tile::
+            reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
+                input,
+                weight_host_ref,
+                output,
+                conv_param.conv_filter_strides_,
+                conv_param.conv_filter_dilations_,
+                conv_param.input_left_pads_,
+                conv_param.input_right_pads_);
+        const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
+        const float max_accumulated_value =
+            *std::max_element(weight_host_ref.mData.begin(), weight_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
+                GemmK, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(weight,
+                                  weight_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        throw std::runtime_error("Unsupported gpu verification !!!");
+    }
+
+    return pass;
+}
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename AccDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs& args,
+                              int n_warmup,
+                              int n_repeat)
+{
+    float ave_time = grouped_conv_fwd<NDimSpatial,
+                                      InDataType,
+                                      WeiDataType,
+                                      AccDataType,
+                                      OutDataType,
+                                      InLayout,
+                                      WeiLayout,
+                                      OutLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
+
+    std::size_t flop     = args.GetFlops();
+    std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
+    float tflops         = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec     = num_byte / 1.E6 / ave_time;
+
+    std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+
+    return ave_time;
+}
+
+template <ck_tile::index_t NDimSpatial,
+          typename InDataType,
+          typename WeiDataType = InDataType,
+          typename OutDataType = InDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+int run_grouped_conv_fwd_example_with_layouts(
+    int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    using AccDataType = float;
+
+    std::vector<ck_tile::index_t> filter_spatial_lengths;
+    std::vector<ck_tile::index_t> image_spatial_lengths;
+    std::vector<ck_tile::index_t> strides;
+    std::vector<ck_tile::index_t> dilations;
+    std::vector<ck_tile::index_t> lpads;
+    std::vector<ck_tile::index_t> rpads;
+
+    const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
+                                                                image_spatial_lengths,
+                                                                strides,
+                                                                dilations,
+                                                                lpads,
+                                                                rpads,
+                                                                arg_parser);
+
+    ck_tile::conv::ConvParam conv_param{num_dim_sp,
+                                        arg_parser.get_int("g"),
+                                        arg_parser.get_int("n"),
+                                        arg_parser.get_int("k"),
+                                        arg_parser.get_int("c"),
+                                        filter_spatial_lengths,
+                                        image_spatial_lengths,
+                                        strides,
+                                        dilations,
+                                        lpads,
+                                        rpads};
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+
+    const auto in_g_n_c_wis_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
+    const auto wei_g_k_c_xs_desc =
+        ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
+    const auto out_g_n_k_wos_desc =
+        ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
+
+    ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
+    ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
+    ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{-5.f, 5.f}(input);
+        ck_tile::FillUniformDistribution<WeiDataType>{-5.f, 5.f}(weight);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<InDataType>{}(input);
+        ck_tile::FillMonotonicSeq<WeiDataType>{}(weight);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<InDataType>{1.f, 1.f}(input);
+        ck_tile::FillUniformDistribution<WeiDataType>{1.f, 1.f}(weight);
+    }
+    else
+    {
+        input.SetZero();
+        weight.SetZero();
+    }
+
+    ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
+
+    input_dev_buf.ToDevice(input.data());
+    weight_dev_buf.ToDevice(weight.data());
+    output_dev_buf.SetZero();
+
+    ck_tile::GroupedConvFwdHostArgs args(conv_param,
+                                         input_dev_buf.GetDeviceBuffer(),
+                                         weight_dev_buf.GetDeviceBuffer(),
+                                         {},
+                                         output_dev_buf.GetDeviceBuffer(),
+                                         kbatch);
+
+    std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
+    std::cout << "input: " << input.mDesc << std::endl;
+    std::cout << "weight: " << weight.mDesc << std::endl;
+    std::cout << "output: " << output.mDesc << std::endl;
+
+    invoke_grouped_conv_fwd<NDimSpatial,
+                            InDataType,
+                            WeiDataType,
+                            AccDataType,
+                            OutDataType,
+                            InLayout,
+                            WeiLayout,
+                            OutLayout>(args, n_warmup, n_repeat);
+
+    output_dev_buf.FromDevice(output.data());
+    bool pass = true;
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
+        output_host_ref.SetZero();
+
+        ck_tile::reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
+            input,
+            weight,
+            output_host_ref,
+            conv_param.conv_filter_strides_,
+            conv_param.conv_filter_dilations_,
+            conv_param.input_left_pads_,
+            conv_param.input_right_pads_);
+        const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
+        const float max_accumulated_value =
+            *std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end());
+        const auto rtol_atol =
+            calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
+                GemmK, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(output,
+                                  output_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),
+                                  rtol_atol.at(ck_tile::number<1>{}));
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        throw std::runtime_error("Unsupported gpu verification !!!");
+    }
+
+    return pass;
+}
--- a/example/ck_tile/21_elementwise/CMakeLists.txt
+++ b/example/ck_tile/21_elementwise/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Elementwise example targets 2D inputs
+set(TARGET_NAME_2D_INPUT tile_example_elementwise)
+add_executable(${TARGET_NAME_2D_INPUT} elementwise_example.cpp)
+
+# Elementwise unary example targets 2D inputs
+set(TARGET_NAME_2D_INPUT_UNARY tile_example_elementwise_unary)
+add_executable(${TARGET_NAME_2D_INPUT_UNARY} elementwise_example_unary.cpp)
+
+# Elementwise transpose example targets 2D inputs
+set(TARGET_NAME_2D_INPUT_TRANSPOSE tile_example_elementwise_transpose)
+add_executable(${TARGET_NAME_2D_INPUT_TRANSPOSE} elementwise_example_transpose.cpp)
+
+# Elementwise example targets 4D inputs
+set(TARGET_NAME_4D_INPUT tile_example_elementwise_add_4d)
+add_executable(${TARGET_NAME_4D_INPUT} elementwise_example_add_4d.cpp)
--- a/example/ck_tile/21_elementwise/elementwise_example.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example.cpp
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "1024", "m dimension")
+        .insert("n", "1024", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t M      = arg_parser.get_int("m");
+    ck_tile::index_t N      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+
+    // If stride is negative (default -1), set it to N, assuming a dense row-major layout.
+    if(stride < 0)
+        stride = N;
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    if(stride < N)
+    {
+        throw std::runtime_error("stride must be >= N");
+    }
+
+    // Define type aliases for clarity.
+    // XDataType: Data type of the input tensors.
+    // ComputeDataType: Data type used for intermediate computations (often float for precision).
+    // YDataType: Data type of the output tensor.
+    // XElementwiseOperation: The specific elementwise operation to perform (e.g., Add, Mul).
+    using XDataType = DataType;
+    using ComputeDataType =
+        float; // Using float for intermediate calculations can improve numerical stability.
+    using YDataType             = DataType;
+    using XElementwiseOperation = ck_tile::element_wise::Add;
+
+    // 1. Initialize the input data on the host (CPU).
+    // HostTensor is a utility to manage tensor data on the CPU.
+    // The first argument is the shape (dimensions) of the tensor {M, N}.
+    // The second argument is the strides {stride, 1} for row-major layout.
+    // 'x_host_a' and 'x_host_b' are the two input tensors for the elementwise operation.
+    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
+    ck_tile::HostTensor<XDataType> x_host_b({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
+
+    std::vector<ck_tile::index_t> shape = {M, N};
+
+    // Fill the host tensors with random data.
+    // FillUniformDistribution populates the tensor with values from a uniform distribution,
+    // within an interval.
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_b);
+
+    // 2. Create device memory buffers
+    // DeviceMem allocates memory on the GPU.
+    // The size is determined by the total number of elements and the size of DataType.
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
+
+    // Copy data from host input tensors to device buffers.
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    // 3. Configure the kernel execution parameters.
+    // Dividing the problem into blocktile, blockwarp and warptile
+    // The blocktile is the size of the tile processed by a single work group (also called thread
+    // block). The warptile is the size of the tile processed by a single wavefront (also called
+    // warp). The vector is the size of the tile processed by a single work item (also called
+    // thread). The problem is divided into blocks of size BlockTile. Each block is further divided
+    // into wavefronts of size WarpTile. Each wavefront is composed of 64 work items (on AMD; 32
+    // threads on NVIDIA). Each work item in a wavefront processes one vector's worth of elements.
+    // Note that WarpTile/Vector should be 64 for CDNA (because there are 64 work items per
+    // wavefront). Vector size is set to be 16 / sizeof(ComputeDataType), to maximize vectorization.
+    using BlockTile = ck_tile::sequence<2048>; // How many elements are handled by a block tile (the
+                                               // tensor is divided into blocks of this size)
+    using BlockWarps = ck_tile::sequence<8>; // How many concurrent wavefronts are in a block (each
+                                             // wavefront will cover some part of the block tile)
+
+    // WarpTile: Defines the size of the data sub-tile processed by a single wavefront.
+    // This should be consistent with BlockTile and BlockWarps.
+    // If BlockTile is 2048 and BlockWarps is 8, then WarpTile could be 2048/8 = 256.
+    // However, this example uses 64, meaning each wavefront processes 64 elements, and multiple
+    // such wavefront operations might be needed to cover the BlockTile, or the BlockTile is
+    // distributed differently.
+    // The current configuration (BlockTile=2048, BlockWarps=8, WarpTile=64) implies that
+    // each wavefront processes 64 elements, and 8 wavefronts process 8*64 = 512 elements
+    // concurrently. Since 512 is not equal to 2048, it means that warptile(s) will need to iterate
+    // over multiple times over different set of elements to cover the entire BlockTile.
+    using WarpTile = ck_tile::sequence<64>;
+
+    // 4. Create the kernel
+
+    // ElementWiseShape bundles these tiling parameters.
+    // It calculates derived properties like threads per wavefront, repeats, vectorization and total
+    // block size.
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
+
+    // ElementWisePipelineProblem encapsulates all necessary information for the elementwise kernel:
+    // - Data types (input, compute, output).
+    // - Shape traits (tiling configuration).
+    // - The specific elementwise operation (e.g., Add).
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        ComputeDataType,
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    // ElementWiseKernel refers to the GPU kernel class
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    // Compute flattened size
+    ck_tile::index_t total_elements = 1;
+    for(auto d : shape)
+        total_elements *= d;
+
+    // kBlockSize: The number of work items in a GPU workgroup (thread block).
+    // This is often a multiple of the wavefront size, 64 on CDNA.
+    // Here, it's explicitly set to 512. This should be consistent with Shape::kBlockSize.
+    // Shape::kBlockSize would be BlockWarps * warpSize (e.g., 8 * 64 = 512).
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+
+    // kBlockPerCu: Hint for how many workgroups can be scheduled per Compute Unit (CU).
+    // This can influence occupancy and performance.
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    // kGridSize: Calculates the total number of workgroups required to process all elements.
+    // Each workgroup is responsible for 'elements_per_block' elements.
+    // To ensure all elements are covered, especially when 'total_elements' is not perfectly
+    // divisible by 'elements_per_block', using ceiling division.
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "grid size = " << kGridSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                             static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
+
+    auto input_size = ck_tile::make_tuple(M, N);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(input_size))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // 4. Run the kernel
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       input_size,
+                                       ck_tile::make_tuple(N, 1), // Input Stride
+                                       ck_tile::make_tuple(N, 1), // Output Stride
+                                       input_tensors,
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // 5. Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data());
+        auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
+            x_host_a, x_host_b, y_host, op);
+
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
--- a/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_add_4d.cpp
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("dim0", "4", "dimension 0")
+        .insert("dim1", "16", "dimension 1")
+        .insert("dim2", "32", "dimension 2")
+        .insert("dim3", "32", "dimension 3")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t D0 = arg_parser.get_int("dim0");
+    ck_tile::index_t D1 = arg_parser.get_int("dim1");
+    ck_tile::index_t D2 = arg_parser.get_int("dim2");
+    ck_tile::index_t D3 = arg_parser.get_int("dim3");
+
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    using XDataType = DataType;
+    using ComputeDataType =
+        float; // Using float for intermediate calculations can improve numerical stability.
+    using YDataType             = DataType;
+    using XElementwiseOperation = ck_tile::element_wise::Add;
+
+    // Initialize the input data on the host (CPU).
+    std::vector<ck_tile::index_t> problem_shape = {D0, D1, D2, D3};
+
+    std::vector<ck_tile::index_t> host_strides(4);
+    host_strides[3] = 1;
+    host_strides[2] = problem_shape[3];
+    host_strides[1] = problem_shape[2] * problem_shape[3];
+    host_strides[0] = problem_shape[1] * problem_shape[2] * problem_shape[3];
+
+    ck_tile::HostTensor<XDataType> x_host_a(problem_shape, host_strides);
+    ck_tile::HostTensor<XDataType> x_host_b(problem_shape, host_strides);
+    ck_tile::HostTensor<YDataType> y_host(problem_shape, host_strides);
+    ck_tile::HostTensor<YDataType> y_validation(problem_shape, host_strides);
+
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+    ck_tile::FillUniformDistribution<XDataType>{2.f, 10.f}(x_host_b);
+
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_buf_b(x_host_b.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
+
+    x_buf_a.ToDevice(x_host_a.data());
+    x_buf_b.ToDevice(x_host_b.data());
+
+    using BlockTile  = ck_tile::sequence<256>;
+    using BlockWarps = ck_tile::sequence<1>;
+    using WarpTile   = ck_tile::sequence<256>;
+
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
+
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        ComputeDataType,
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    ck_tile::index_t total_elements = 1;
+    for(auto d : problem_shape)
+        total_elements *= d;
+
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+
+    constexpr ck_tile::index_t kBlockPerCu = 2;
+
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "grid size = " << kGridSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()),
+                                             static_cast<XDataType*>(x_buf_b.GetDeviceBuffer()));
+
+    auto problem_shape_tuple =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+
+    auto strides_tuple =
+        ck_tile::make_tuple(host_strides[0], host_strides[1], host_strides[2], host_strides[3]);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(problem_shape_tuple))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // Run the kernel
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+            Kernel{},
+            kGridSize,
+            kBlockSize,
+            0,
+            problem_shape_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t>
+            strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for input strides
+            strides_tuple, // ck_tile::tuple<index_t, index_t, index_t, index_t> for output strides
+            input_tensors,
+            static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data());
+        auto op = [](const auto& v0, const auto& v1) { return v0 + v1; };
+
+        ck_tile::reference_binary_elementwise<XDataType, XDataType, YDataType, ComputeDataType>(
+            x_host_a, x_host_b, y_host, op);
+
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
--- a/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_transpose.cpp
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_transpose.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "1024", "m dimension of input")
+        .insert("n", "1024", "n dimension of input")
+        .insert("stride_in", "-1", "stride for input M dim, if -1 then equal to n")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t M         = arg_parser.get_int("m");
+    ck_tile::index_t N         = arg_parser.get_int("n");
+    ck_tile::index_t stride_in = arg_parser.get_int("stride_in");
+
+    if(stride_in < 0)
+        stride_in = N; // Dense input: stride for M dim is N
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    if(stride_in < N)
+    {
+        throw std::runtime_error("stride_in must be >= N");
+    }
+
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
+    // Use PassThrough operation for transposition (data is moved, not changed)
+    using XElementwiseOperation = ck_tile::element_wise::PassThrough;
+
+    // 1. Initialize the input data on the host (CPU).
+    // Input x_host_a: M x N
+    // Output y_host: N x M (transposed)
+    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride_in, 1});
+    // Output tensor y_host will have dimensions N x M.
+    // Assuming dense output, its stride for the N dimension will be M.
+    ck_tile::index_t stride_out_dim0 = M;
+    ck_tile::HostTensor<YDataType> y_host({N, M}, {stride_out_dim0, 1});
+    ck_tile::HostTensor<YDataType> y_validation({N, M}, {stride_out_dim0, 1});
+
+    // The logical shape for the element-wise operation kernel is based on the input tensor's
+    // elements.
+    std::vector<ck_tile::index_t> op_shape_vec = {M, N};
+    auto op_lengths                            = ck_tile::make_tuple(M, N); // Lens for the kernel
+
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+
+    // 2. Create device memory buffers
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes()); // y_host is N x M
+
+    x_buf_a.ToDevice(x_host_a.data());
+
+    // 3. Configure the kernel execution parameters.
+    using BlockTile  = ck_tile::sequence<1024>;
+    using BlockWarps = ck_tile::sequence<8>;
+    using WarpTile   = ck_tile::sequence<64>;
+
+    using Shape = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
+
+    // Problem definition for a single input tensor
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        ComputeDataType,
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    ck_tile::index_t total_elements = M * N;
+
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    constexpr ck_tile::index_t kBlockPerCu        = 1;
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "Input M=" << M << ", N=" << N << ", StrideIn=" << stride_in << std::endl;
+    std::cout << "Output N=" << N << ", M=" << M << ", StrideOut=" << stride_out_dim0 << std::endl;
+    std::cout << "Grid size = " << kGridSize << ", BlockSize = " << kBlockSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    // Input tensors tuple (single input)
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
+    // Input strides tuple (tuple of tuples, one for each input)
+    auto input_strides = ck_tile::make_tuple(stride_in, 1);
+    // Output strides (for N x M tensor, dense)
+    auto output_strides = ck_tile::make_tuple(1, stride_out_dim0);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(op_lengths))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // 4. Run the kernel
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,             // Shared memory
+                                       op_lengths,    // Logical dimensions for the operation (M, N)
+                                       input_strides, // Strides for input tensor(s)
+                                       output_strides, // Strides for output tensor (N, M)
+                                       input_tensors,
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // 5. Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data()); // Copy result from device to y_validation
+        ck_tile::reference_transpose_elementwise<XDataType, YDataType>(
+            x_host_a, y_host); // Compute reference on host
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Transpose Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    std::cerr << "Unsupported data type: " << data_type << std::endl;
+    return -3;
+}
--- a/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
+++ b/example/ck_tile/21_elementwise/elementwise_example_unary.cpp
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/core/arch/arch.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/host/reference/reference_elementwise.hpp"
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "1024", "m dimension")
+        .insert("n", "1024", "n dimension")
+        .insert("stride", "-1", "stride per row, if -1 then equal to n")
+        .insert("v", "1", "cpu validation or not")
+        .insert("prec", "fp16", "precision")
+        .insert("warmup", "10", "cold iter")
+        .insert("repeat", "50", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t M      = arg_parser.get_int("m");
+    ck_tile::index_t N      = arg_parser.get_int("n");
+    ck_tile::index_t stride = arg_parser.get_int("stride");
+    if(stride < 0)
+        stride = N;
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+    int warmup            = arg_parser.get_int("warmup");
+    int repeat            = arg_parser.get_int("repeat");
+
+    assert(stride >= N);
+
+    using XDataType             = DataType;
+    using YDataType             = DataType;
+    using XElementwiseOperation = ck_tile::element_wise::UnarySquare;
+
+    // 1. Initialize the input data on the host
+    ck_tile::HostTensor<XDataType> x_host_a({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_host({M, N}, {stride, 1});
+    ck_tile::HostTensor<YDataType> y_validation({M, N}, {stride, 1});
+
+    std::vector<ck_tile::index_t> shape = {M, N};
+
+    ck_tile::FillUniformDistribution<XDataType>{0.f, 5.f}(x_host_a);
+
+    // 2. Create device memory buffers and copy input data from host to device
+    ck_tile::DeviceMem x_buf_a(x_host_a.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host.get_element_space_size_in_bytes());
+    x_buf_a.ToDevice(x_host_a.data());
+
+    // 3. Create the kernel
+
+    // Dividing the problem into blocktile, warptile, and vector
+    using BlockTile = ck_tile::sequence<2048>; // Size of the block tile (Entire problem is divided
+                                               // into blocks of this size)
+    using BlockWarps = ck_tile::sequence<8>; // How many concurrent warps are in a block (Each warp
+                                             // will cover some part of blockTile)
+    using WarpTile = ck_tile::sequence<64>;  // How many elements are covered by a warp
+
+    using Shape   = ck_tile::ElementWiseShape<BlockWarps, BlockTile, WarpTile, XDataType>;
+    using Problem = ck_tile::ElementWisePipelineProblem<XDataType,
+                                                        XDataType, // ComputeDataType is same as
+                                                                   // XDataType in the unary case
+                                                        YDataType,
+                                                        Shape,
+                                                        XElementwiseOperation>;
+
+    using Kernel = ck_tile::ElementWiseKernel<Problem, ck_tile::ElementWiseDefaultPolicy>;
+
+    // Compute flattened size
+    ck_tile::index_t total_elements = 1;
+    for(auto d : shape)
+        total_elements *= d;
+
+    constexpr ck_tile::index_t kBlockSize =
+        ck_tile::get_warp_size() * BlockWarps::at(ck_tile::number<0>{});
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    constexpr ck_tile::index_t elements_per_block = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+    std::cout << "grid size = " << kGridSize << std::endl;
+    std::cout << "Total elements = " << total_elements << std::endl;
+
+    auto input_tensors = ck_tile::make_tuple(static_cast<XDataType*>(x_buf_a.GetDeviceBuffer()));
+    auto input_size    = ck_tile::make_tuple(M, N);
+
+    // Check if the kernel configuration is supported
+    if(!Kernel::IsSupportedArgument(input_size))
+    {
+        throw std::runtime_error(
+            "The kernel configuration is not supported for the given input size.");
+    }
+
+    // 4. Run the kernel
+    float ave_time = launch_kernel(ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+                                   ck_tile::make_kernel<kBlockSize, kBlockPerCu>(
+                                       Kernel{},
+                                       kGridSize,
+                                       kBlockSize,
+                                       0,
+                                       input_size,
+                                       ck_tile::make_tuple(N, 1), // Input Stride
+                                       ck_tile::make_tuple(N, 1), // Output Stride
+                                       input_tensors,
+                                       static_cast<YDataType*>(y_buf.GetDeviceBuffer())));
+
+    std::cout << "Average time: " << ave_time << " ms" << std::endl;
+
+    // 5. Verify the output
+    bool pass = true;
+    if(do_validation)
+    {
+        y_buf.FromDevice(y_validation.data());
+
+        auto op = [](const auto& v0) { return v0 * v0; };
+
+        ck_tile::reference_unary_elementwise<XDataType, YDataType, YDataType>(x_host_a, y_host, op);
+
+        pass = ck_tile::check_err(
+            y_validation, y_host, "Elementwise Add Error: Incorrect results!", 0.01, 0.01);
+    }
+
+    return pass;
+}
+
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    return -3;
+}
--- a/example/ck_tile/35_batched_transpose/README.md
+++ b/example/ck_tile/35_batched_transpose/README.md
@@ -24,4 +24,6 @@ args:
 -layout_out    output tensor data layout - NHWC by default
       -seed    seed to be used, -1 means random every time (default:-1)
     -k_name    t to 1 will print kernel name (default:0)
+     -warmup    warmup iterations to run this kernel (default:50)
+     -repeat    number of iterations to run this kernel (default:100)
 ```
--- a/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp
+++ b/example/ck_tile/35_batched_transpose/batched_transpose_api.cpp
@@ -1,59 +1,132 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 #include "batched_transpose_example.hpp"
-#include <iostream>

-template <typename ts_type,
-          ck_tile::index_t block_x,
-          ck_tile::index_t block_y,
-          ck_tile::index_t warp_x,
-          ck_tile::index_t warp_y,
-          ck_tile::index_t thread_x,
-          ck_tile::index_t thread_y>
+namespace {
+
+template <int32_t pipeline_id>
+struct kernel_traits;
+
+template <>
+struct kernel_traits<0>
+{
+    template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
+    using Problem =
+        ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
+    using Policy = ck_tile::BatchedTransposePolicy;
+    template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
+    using Pipeline =
+        ck_tile::BatchedTransposePipeline<Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
+                                          Policy>;
+};
+
+template <>
+struct kernel_traits<1>
+{
+    template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
+    using Problem =
+        ck_tile::BatchedTransposeLdsProblem<ts_type, block_tile, warp_layout, kPadM, kPadN>;
+    using Policy = ck_tile::BatchedTransposeLdsPolicy;
+    template <typename ts_type, typename block_tile, typename warp_layout, bool kPadM, bool kPadN>
+    using Pipeline = ck_tile::BatchedTransposeLdsPipeline<
+        Problem<ts_type, block_tile, warp_layout, kPadM, kPadN>,
+        Policy>;
+};
+} // namespace
+
+template <typename InputType_,
+          ck_tile::index_t BlockX_,
+          ck_tile::index_t BlockY_,
+          ck_tile::index_t NumWarpsX_,
+          ck_tile::index_t NumWarpsY_,
+          bool PadM_,
+          bool PadN_,
+          ck_tile::index_t PipelineId_>
+struct BatchedTransposeConfig
+{
+    using InputType                               = InputType_;
+    static constexpr ck_tile::index_t kBlockX     = BlockX_;
+    static constexpr ck_tile::index_t kBlockY     = BlockY_;
+    static constexpr ck_tile::index_t kNumWarpsX  = NumWarpsX_;
+    static constexpr ck_tile::index_t kNumWarpsY  = NumWarpsY_;
+    static constexpr bool kPadM                   = PadM_;
+    static constexpr bool kPadN                   = PadN_;
+    static constexpr ck_tile::index_t kPipelineId = PipelineId_;
+};
+
+template <typename Config>
 float batched_transpose_dispatch(batched_transpose_kargs& a, ck_tile::stream_config& s)
 {
-    uint32_t dim_block_h = (a.height + block_y - 1) / block_y;
-    uint32_t dim_block_w = (a.width + block_x - 1) / block_x;
-    uint32_t dim_stride  = a.height * a.width;
+    uint32_t dim_stride = a.height * a.width;

    a.dim_stride  = dim_stride;
-    a.dim_block_h = dim_block_h;
-    a.dim_block_w = dim_block_w;
+    a.dim_block_h = Config::kBlockY;
+    a.dim_block_w = Config::kBlockX;

-    using block_tile  = ck_tile::sequence<block_x, block_y>;
-    using warp_tile   = ck_tile::sequence<warp_x, warp_y>;
-    using thread_tile = ck_tile::sequence<thread_x, thread_y>;
-
-    using ts_problem =
-        ck_tile::BatchedTransposeProblem<ts_type, block_tile, warp_tile, thread_tile>;
-    using ts_pipeline = ck_tile::BatchedTransposePipeline<ts_problem>;
-
-    using kernel = ck_tile::BatchedTransposeKernel<ts_pipeline>;
+    // TODO: this is fragile and slow to compile
+    using kernel = ck_tile::BatchedTransposeKernel<
+        typename kernel_traits<Config::kPipelineId>::template Pipeline<
+            typename Config::InputType,
+            ck_tile::sequence<Config::kBlockX, Config::kBlockY>,
+            ck_tile::sequence<Config::kNumWarpsX, Config::kNumWarpsY>,
+            Config::kPadM,
+            Config::kPadN>>;

    auto kargs = kernel::MakeKargs(a);

    const dim3 grids      = kernel::GridSize(a);
    constexpr dim3 blocks = kernel::BlockSize();

+    printf("Pipeline: %d\n", Config::kPipelineId);
+    printf("Grid: x=%u y=%u z=%u\n", grids.x, grids.y, grids.z);
+    printf("Block: x=%u y=%u z=%u\n", blocks.x, blocks.y, blocks.z);
+    printf(
+        "Host args: batch=%d, height=%d, width=%d, dim_stride=%d, dim_block_h=%d, dim_block_w=%d\n",
+        a.batch,
+        a.height,
+        a.width,
+        a.dim_stride,
+        a.dim_block_h,
+        a.dim_block_w);
+    printf("kargs: kargs.batch=%d kargs.height=%d kargs.width=%d kargs.dim_stride=%d\n",
+           kargs.batch,
+           kargs.height,
+           kargs.width,
+           kargs.dim_stride);
+
+    printf("Launching Kernel...\n");
+
    float ave_time = ck_tile::launch_kernel(
        s, ck_tile::make_kernel<blocks.x, 1>(kernel{}, grids, blocks, 0, kargs));

+    printf("Kernel finished...\n");
+
    return ave_time;
 }

-// Param Comb: type_size, block_x & y, warp_x & y, thread_x & y
-#define FOREACH_TRANSPOSE_PARAM(F)               \
-    F(fp16, ck_tile::fp16_t, 16, 16, 8, 8, 1, 1) \
-    F(bf16, ck_tile::bf16_t, 16, 16, 8, 8, 1, 1) \
-    F(fp32, ck_tile::fp32_t, 16, 16, 8, 8, 1, 1) \
-    F(int8, ck_tile::int8_t, 16, 16, 8, 8, 1, 1)
+// Param Comb: type_size, block_x & y, WarpNum_x & y
+#define FOREACH_TRANSPOSE_PARAM(F)                          \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 0)     \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 0)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 0)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 0) \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 0)   \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 0) \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, true, true, 1)     \
+    F(fp8, ck_tile::fp8_t, 64, 64, 1, 1, false, false, 1)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, true, true, 1)   \
+    F(fp16, ck_tile::fp16_t, 64, 64, 1, 1, false, false, 1) \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, true, true, 1)   \
+    F(bf16, ck_tile::bf16_t, 64, 64, 1, 1, false, false, 1)

 // Macro that defines one static function per line
-#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, TX, TY)               \
-    static float transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##TX##_##TY( \
-        batched_transpose_kargs& a, ck_tile::stream_config& s)                        \
-    {                                                                                 \
-        return batched_transpose_dispatch<REAL_TYPE, BX, BY, WX, WY, TX, TY>(a, s);   \
+#define GEN_TRANSPOSE_FN(SHORT_NAME, REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE)          \
+    static float                                                                           \
+        transpose_fn_##SHORT_NAME##_##BX##_##BY##_##WX##_##WY##_##PADM##_##PADN##_v##PIPE( \
+            batched_transpose_kargs& a, ck_tile::stream_config& s)                         \
+    {                                                                                      \
+        return batched_transpose_dispatch<                                                 \
+            BatchedTransposeConfig<REAL_TYPE, BX, BY, WX, WY, PADM, PADN, PIPE>>(a, s);    \
    }

 FOREACH_TRANSPOSE_PARAM(GEN_TRANSPOSE_FN)
@@ -62,21 +135,78 @@ float batched_transpose(batched_transpose_trait t,
                        batched_transpose_kargs a,
                        ck_tile::stream_config s)
 {
-    if(t.type == "fp16")
+    if(t.pipeline == "0")
    {
-        return transpose_fn_fp16_16_16_8_8_1_1(a, s);
+        if(t.type == "fp8")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_fp8_64_64_1_1_false_false_v0(a, s);
+            }
+            else
+            {
+                return transpose_fn_fp8_64_64_1_1_true_true_v0(a, s);
+            }
+        }
+        else if(t.type == "fp16")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_fp16_64_64_1_1_false_false_v0(a, s);
+            }
+            else
+            {
+                return transpose_fn_fp16_64_64_1_1_true_true_v0(a, s);
+            }
+        }
+        else if(t.type == "bf16")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_bf16_64_64_1_1_false_false_v0(a, s);
+            }
+            else
+            {
+                return transpose_fn_bf16_64_64_1_1_true_true_v0(a, s);
+            }
+        }
    }
-    else if(t.type == "bf16")
+    else if(t.pipeline == "1")
    {
-        return transpose_fn_bf16_16_16_8_8_1_1(a, s);
-    }
-    else if(t.type == "fp32")
-    {
-        return transpose_fn_fp32_16_16_8_8_1_1(a, s);
-    }
-    else if(t.type == "int8")
-    {
-        return transpose_fn_int8_16_16_8_8_1_1(a, s);
+        if(t.type == "fp8")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_fp8_64_64_1_1_false_false_v1(a, s);
+            }
+            else
+            {
+                return transpose_fn_fp8_64_64_1_1_true_true_v1(a, s);
+            }
+        }
+        else if(t.type == "fp16")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_fp16_64_64_1_1_false_false_v1(a, s);
+            }
+            else
+            {
+                return transpose_fn_fp16_64_64_1_1_true_true_v1(a, s);
+            }
+        }
+        else if(t.type == "bf16")
+        {
+            if(a.height % 64 == 0 && a.width % 64 == 0)
+            {
+                return transpose_fn_bf16_64_64_1_1_false_false_v1(a, s);
+            }
+            else
+            {
+                return transpose_fn_bf16_64_64_1_1_true_true_v1(a, s);
+            }
+        }
    }
+
    return -1;
 }
--- a/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp
+++ b/example/ck_tile/35_batched_transpose/batched_transpose_example.cpp
@@ -21,13 +21,13 @@ void dump_host_tensor_4d(const ck_tile::HostTensor<T>& x)
    std::cout << "[";
    for(size_t i = 0; i < len[0]; i++)
    {
-        std::cout << i << ": [";
+        std::cout << "Batch " << i << ":" << std::endl;
        for(size_t j = 0; j < len[1]; j++)
        {
-            std::cout << j << ": [";
+            std::cout << "  Channel " << j << ":" << std::endl;
            for(size_t k = 0; k < len[2]; k++)
            {
-                std::cout << k << ": [";
+                std::cout << "    Row " << k << ": ";
                for(size_t v = 0; v < len[3]; v++)
                {
                    if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
@@ -41,15 +41,15 @@ void dump_host_tensor_4d(const ck_tile::HostTensor<T>& x)
                    }
                    else
                    {
-                        std::cout << x(std::vector<std::size_t>{i, j, k, v}) << " ";
+                        std::cout << static_cast<int>(x(std::vector<std::size_t>{i, j, k, v}))
+                                  << " ";
                    }
                }
-                std::cout << "]" << std::endl;
+                std::cout << std::endl;
            }
-            std::cout << "]" << std::endl;
        }
-        std::cout << std::endl;
    }
+    std::cout << "]" << std::endl;
    std::cout << "--------------------" << std::endl;
 }
 #endif
@@ -93,14 +93,17 @@ auto create_args(int argc, char* argv[])
    ck_tile::ArgParser arg_parser;
    arg_parser.insert("v", "1", "whether do CPU validation or not")
        .insert("pr", "fp16", "input data type. fp16/fp32 (representing 8/16/32 bit data)")
-        .insert("N", "2", "input batch size. ")
-        .insert("C", "16", "input channel size.")
-        .insert("H", "1", "input height size.")
-        .insert("W", "16", "input width size. ")
+        .insert("N", "1", "input batch size. ")
+        .insert("C", "64", "input channel size.")
+        .insert("H", "18", "input height size.")
+        .insert("W", "64", "input width size. ")
        .insert("layout_in", "NCHW", "input tensor data layout - NCHW by default")
        .insert("layout_out", "NHWC", "output tensor data layout - NHWC by default ")
+        .insert("warmup", "50", "number of iterations before benchmark the kernel")
+        .insert("repeat", "100", "number of iterations to benchmark the kernel")
        .insert("seed", "-1", "seed to be used, -1 means random every time")
-        .insert("kname", "0", "t to 1 will print kernel name");
+        .insert("kname", "0", "t to 1 will print kernel name")
+        .insert("pipeline", "0", "0: no LDS usage, 1: LDS-accelerated (gfx950)");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -115,8 +118,11 @@ bool run_batched_transpose(ck_tile::ArgParser args)
    int C                  = args.get_int("C");
    int H                  = args.get_int("H");
    int W                  = args.get_int("W");
+    int n_warmup           = args.get_int("warmup");
+    int n_repeat           = args.get_int("repeat");
    std::string layout_in  = args.get_str("layout_in");
    std::string layout_out = args.get_str("layout_out");
+    std::string pipeline   = args.get_str("pipeline");
    int seed               = args.get_int("seed");

    int dim_in[4], dim_out[4];
@@ -162,7 +168,7 @@ bool run_batched_transpose(ck_tile::ArgParser args)

    x_dev.ToDevice(x_host.data());

-    auto trait = batched_transpose_trait{prec, layout_in};
+    auto trait = batched_transpose_trait{prec, layout_in, pipeline};

    uint32_t height = nchw2nhwc ? C : H * W;
    uint32_t width  = nchw2nhwc ? H * W : C;
@@ -177,21 +183,19 @@ bool run_batched_transpose(ck_tile::ArgParser args)
        return a_;
    }();

-    ck_tile::stream_config sc{nullptr, true};
+    ck_tile::stream_config sc{nullptr, true, n_warmup, n_repeat};

    auto ms = batched_transpose(trait, karg, sc);

-    std::size_t num_operations = N * C * H * (W - 1);
-    std::size_t num_bytes      = N * C * H * W * sizeof(Type);
+    std::size_t num_bytes = N * C * H * W * sizeof(Type) * 2; // read + written

-    float ave_time   = ms * 1E-3;
    float gb_per_sec = num_bytes / ms * 1.E-6;
-    float tflops     = static_cast<float>(num_operations) / ms * 1.E-6;

    std::cout << "Run Batched Transpose kernel with N=" << N << ", C=" << C << ", H=" << H
              << ", W=" << W << ", layout_in=" << layout_in << ", layout_out=" << layout_out
-              << " : " << ms << " ms (" << ave_time << " ave_time), " << tflops << " TFlops"
-              << gb_per_sec << " GB/s, " << std::endl;
+              << " : " << std::endl
+              << ms << " ms " << std::endl
+              << gb_per_sec << " GB/s " << std::endl;

    printf("[%s]N:%d, C:%d, H:%d, W:%d, layout_in:%s, %f\n",
           prec.c_str(),
@@ -202,7 +206,8 @@ bool run_batched_transpose(ck_tile::ArgParser args)
           layout_in.c_str(),
           ms);
    if(ms < 0)
-        printf("not supported\n");
+        printf("------------------------------------not "
+               "supported-------------------------------------\n");
    fflush(stdout);

    if(ms < 0)
@@ -227,7 +232,9 @@ bool run_batched_transpose(ck_tile::ArgParser args)
        rtn &= ck_tile::check_err(
            y_host, y_ref, std::string("y Error: Incorrect results!"), rtol, atol);
    }
-    printf("valid:%s\n", rtn ? "y" : "n");
+    printf("-----------------------------------------------------------------------valid:%s--------"
+           "--------------------------------------------------------------------\n",
+           rtn ? "y" : "n");
    fflush(stdout);
    return rtn;
 }
@@ -240,9 +247,9 @@ int main(int argc, char** argv)
    std::string prec = args.get_str("pr");

    bool r = true;
-    if(prec.compare("fp32") == 0)
+    if(prec.compare("fp8") == 0)
    {
-        r &= run_batched_transpose<float>(args);
+        r &= run_batched_transpose<ck_tile::fp8_t>(args);
    }
    else if(prec.compare("fp16") == 0)
    {
@@ -252,10 +259,6 @@ int main(int argc, char** argv)
    {
        r &= run_batched_transpose<ck_tile::bf16_t>(args);
    }
-    else if(prec.compare("int8") == 0)
-    {
-        r &= run_batched_transpose<ck_tile::int8_t>(args);
-    }

    return r ? 0 : -1;
 }
--- a/example/ck_tile/35_batched_transpose/batched_transpose_example.hpp
+++ b/example/ck_tile/35_batched_transpose/batched_transpose_example.hpp
@@ -14,6 +14,7 @@ struct batched_transpose_trait
 {
    std::string type;
    std::string layout;
+    std::string pipeline;
 };

 struct batched_transpose_kargs : public ck_tile::BatchedTransposeHostArgs
--- a/example/ck_tile/35_batched_transpose/script/perf_test.sh
+++ b/example/ck_tile/35_batched_transpose/script/perf_test.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier:  MIT
+
+EXE=./build/bin/tile_example_batched_transpose
+
+for C in "64" "256" "1024" "4096" "16384"; do
+for W in "64" "256" "1024" "4096" "16384"; do
+for pr in "fp8" "fp16" "bf16"; do
+for pipeline in "0" "1"; do
+
+$EXE -pipeline=$pipeline -pr=$pr -N=1 -C=$C -H=1 -W=$W -layout_in='NCHW' -layout_out='NHWC'
+
+done
+done
+done
+done
--- a/Show More
+++ b/Show More
				`@@ -1,2 +1 @@`
				`add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)`