Export ROCm/rocm-libraries@2d4a3223cb

2026-05-05 14:11:29 +00:00 · 2026-03-11 23:03:20 -04:00
commit e6cd3f1e3f
6330 changed files with 1132789 additions and 0 deletions
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -0,0 +1,215 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
+# Currently only gfx9, gfx11 and gfx12 archs are supported by FMHA
+list(FILTER INST_TARGETS INCLUDE REGEX "gfx9|gfx1[12]")
+if(NOT INST_TARGETS)
+  message(WARNING "Skipping Tile Engine FMHA compilation: No supported GPU targets (gfx9, gfx11, gfx12) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}")
+  return()
+endif()
+
+# validate user-specified fmha_fwd API list
+set(FMHA_FWD_KNOWN_APIS "fwd;fwd_splitkv;fwd_appendkv;pagedkv_prefill")
+set(FMHA_FWD_ENABLE_APIS "fwd" CACHE STRING
+  "semicolon-separated list of APIs to generate (${FMHA_FWD_KNOWN_APIS}) & link, or \"all\".")
+if(BUILD_TESTING)
+  # Tests cover every FWD API, so override the selection and build instances of all of them
+  message(DEBUG "Enabling all FWD APIs of CK Tile FMHA because testing is enabled")
+  set(FMHA_FWD_ENABLE_APIS "all")
+endif()
+if(FMHA_FWD_ENABLE_APIS STREQUAL "all")
+  set(FMHA_FWD_ENABLE_APIS ${FMHA_FWD_KNOWN_APIS})
+endif()
+
+foreach(api ${FMHA_FWD_ENABLE_APIS})
+  if(NOT "${api}" IN_LIST FMHA_FWD_KNOWN_APIS)
+    message(FATAL_ERROR "${api} isn't a known api: ${FMHA_FWD_KNOWN_APIS}.")
+  endif()
+endforeach()
+
+# "fwd" is a must-have api for the fmha_fwd example, add it if not specified
+if(NOT "fwd" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(PREPEND FMHA_FWD_ENABLE_APIS "fwd")
+endif()
+
+file(GLOB_RECURSE CODE_GEN_SCRIPTS CONFIGURE_DEPENDS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  ${CMAKE_CURRENT_LIST_DIR}/codegen/*.py
+)
+# re-run execute_process `generate.py --list_blobs` if any of the codegen scripts change
+set_directory_properties(PROPERTIES CMAKE_CONFIGURE_DEPENDS "${CODE_GEN_SCRIPTS}")
+
+list(JOIN INST_TARGETS , FMHA_TARGETS_ARG)
+
+string(REPLACE ";" "," FMHA_FWD_APIS "${FMHA_FWD_ENABLE_APIS}")
+set(FMHA_FWD_CODE_GEN_COMMON_ARGS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --targets ${FMHA_TARGETS_ARG}
+  --api ${FMHA_FWD_APIS}
+  --optdim 32,64,80,128,256
+  # --filter fmha_fwd...
+)
+set(FMHA_BWD_CODE_GEN_COMMON_ARGS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --targets ${FMHA_TARGETS_ARG}
+  --api bwd
+  --receipt 3
+  --optdim 32,64,96,128,256
+  # --filter fmha_bwd_dot...@fmha_bwd_convert...@fmha_bwd...
+)
+
+# Reduce building time by disabling instances that are not currently used in the gtests
+# TODO: Consider using a special receipt for testing only, or even two receipts: a small subset of
+# instances for quick CI runs and a larger subset for scheduled runs (the tests are skipped when
+# there is no corresponding instance for parameters).
+if(BUILD_TESTING)
+  # Filters are in the order of FMHA_FWD_KNOWN_APIS: fwd,fwd_splitkv_combine@fwd_splitkv,fwd_appendkv,pagedkv_prefill
+  list(APPEND FMHA_FWD_CODE_GEN_COMMON_ARGS --filter *_nlogits*_nskip*_nsink*,*@*_nlogits*_nbias*_nsink*,*,*_nlogits*_nskip*_pagedkv*)
+endif()
+
+# generate a list of kernels, but do not actually emit files at the config stage
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_FWD_CODE_GEN_COMMON_ARGS}
+  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message(FATAL_ERROR "CK Tile FMHA FAILED to generate a list of FWD kernels via Python.")
+endif()
+
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_BWD_CODE_GEN_COMMON_ARGS}
+  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message(FATAL_ERROR "CK Tile FMHA FAILED to generate a list of BWD kernels via Python.")
+endif()
+
+# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS/FMHA_BWD_GEN_BLOBS files must be in the same directory
+#       as current cmake list, otherwise will not figure out the dependency properly
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/fwd_blob_list.txt FMHA_FWD_GEN_BLOBS)
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/bwd_blob_list.txt FMHA_BWD_GEN_BLOBS)
+
+add_custom_command(
+  OUTPUT ${FMHA_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_FWD_CODE_GEN_COMMON_ARGS}
+  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${CODE_GEN_SCRIPTS}
+  COMMENT "Generate CK Tile FMHA FWD kernels"
+)
+
+add_custom_command(
+  OUTPUT ${FMHA_BWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${FMHA_BWD_CODE_GEN_COMMON_ARGS}
+  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
+  DEPENDS ${CODE_GEN_SCRIPTS}
+  COMMENT "Generate CK Tile FMHA BWD kernels"
+)
+
+set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances")
+set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances")
+
+message(DEBUG "adding instances ${FMHA_FWD_INSTANCES}")
+# to save build time, exclude the target from "all" target of "01_fmha" directory and its ancestors
+add_library(${FMHA_FWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
+target_include_directories(${FMHA_FWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${FMHA_FWD_INSTANCES} PRIVATE ${FMHA_FWD_GEN_BLOBS})
+set_source_files_properties(${FMHA_FWD_GEN_BLOBS} PROPERTIES LANGUAGE HIP)
+set_property(TARGET ${FMHA_FWD_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS})
+
+message(DEBUG "adding instances ${FMHA_BWD_INSTANCES}")
+add_library(${FMHA_BWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
+target_include_directories(${FMHA_BWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${FMHA_BWD_INSTANCES} PRIVATE ${FMHA_BWD_GEN_BLOBS})
+set_source_files_properties(${FMHA_BWD_GEN_BLOBS} PROPERTIES LANGUAGE HIP)
+set_property(TARGET ${FMHA_BWD_INSTANCES} PROPERTY HIP_ARCHITECTURES ${INST_TARGETS})
+
+set(FMHA_FWD_PRIVATE_COMPILE_OPTIONS)
+set(FMHA_BWD_PRIVATE_COMPILE_OPTIONS)
+set(FMHA_FWD_INTERFACE_COMPILE_OPTIONS)
+set(FMHA_BWD_INTERFACE_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let the sources compile without explicitly declaring
+#       function specializations, because they are auto-generated
+list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -Wno-undefined-func-template)
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -Wno-undefined-func-template)
+
+# Allow comparing floating points directly in order to check sentinel values
+list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -Wno-float-equal)
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -Wno-float-equal)
+
+# NOTE: this is dangerous since it will change the whole kernel to flush denormals
+#       WIP with the compiler team on an exp2 intrinsic; remove this once it is available
+if(NOT DEFINED FMHA_FWD_FAST_EXP2)
+  set(FMHA_FWD_FAST_EXP2 ON)
+endif()
+
+if(FMHA_FWD_FAST_EXP2)
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+else()
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+endif()
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -fgpu-flush-denormals-to-zero)
+
+# conditionally enable call to the fwd_splitkv API in fmha_fwd example and tests
+if("fwd_splitkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+else()
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=0)
+endif()
+
+# conditionally enable call to the fwd_appendkv API in fmha_fwd example and tests
+if("fwd_appendkv" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+else()
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=0)
+endif()
+
+# conditionally enable call to the pagedkv_prefill API in fmha_fwd example and tests
+if("pagedkv_prefill" IN_LIST FMHA_FWD_ENABLE_APIS)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=1)
+else()
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_PAGEDKV_API=0)
+endif()
+
+# conditionally specify the use of OCP_FP8
+if(CK_USE_OCP_FP8)
+  list(APPEND FMHA_FWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  list(APPEND FMHA_FWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+endif()
+
+# use RTN_ASM on float to bfloat16 conversion by default, align with FA upstream
+list(APPEND FMHA_BWD_PRIVATE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
+list(APPEND FMHA_BWD_INTERFACE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=3)
+
+target_compile_options(${FMHA_FWD_INSTANCES}
+  PRIVATE ${FMHA_FWD_PRIVATE_COMPILE_OPTIONS}
+  INTERFACE ${FMHA_FWD_INTERFACE_COMPILE_OPTIONS})
+target_compile_options(${FMHA_BWD_INSTANCES} 
+  PRIVATE ${FMHA_BWD_PRIVATE_COMPILE_OPTIONS}
+  INTERFACE ${FMHA_BWD_INTERFACE_COMPILE_OPTIONS})
+
+set(EXAMPLE_FMHA_FWD "tile_example_fmha_fwd")
+set(EXAMPLE_FMHA_BWD "tile_example_fmha_bwd")
+
+message(DEBUG "adding example ${EXAMPLE_FMHA_FWD}")
+# not using add_example_executable() to add this target, since we don't want this to be included in
+# "make all/install/check"
+add_executable(${EXAMPLE_FMHA_FWD} EXCLUDE_FROM_ALL example_fmha_fwd.cpp)
+target_link_libraries(${EXAMPLE_FMHA_FWD} ${FMHA_FWD_INSTANCES})
+target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+message(DEBUG "adding example ${EXAMPLE_FMHA_BWD}")
+# not using add_example_executable() to add this target, since we don't want this to be included in
+# "make all/install/check"
+add_executable(${EXAMPLE_FMHA_BWD} EXCLUDE_FROM_ALL example_fmha_bwd.cpp)
+target_link_libraries(${EXAMPLE_FMHA_BWD} ${FMHA_BWD_INSTANCES})
+target_include_directories(${EXAMPLE_FMHA_BWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+
+# TODO: we have to turn off this global prop, otherwise the progress messages generated
+# by cmake will print too many files, causing "execvp: /bin/sh: Argument list too long".
+# However, this property is global, so it may affect other targets as well.
+# TODO: consider generating a makefile ourselves in codegen
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -0,0 +1,167 @@
+# fused multi-head attention
+
+This folder contains an example of fmha (fused multi-head attention) using the ck_tile tile-programming implementation. It is a good example to demonstrate the usage of the tile-programming API, as well as to illustrate the new approach to constructing a kernel template and instantiating it (them) while keeping compile time fast.
+
+## build
+```
+# 1. In the root of composable_kernel project, create the build directory.
+[~/composable_kernel] mkdir build && cd build
+# 2. In the build directory, run the CMake wrapper script to generate the build system files. Replace <arch> with the gfx architectures string.
+[~/composable_kernel/build] ../script/cmake-ck-dev.sh .. <arch> -G Ninja
+# 3. In the build directory, run the build system recipe.
+[~/composable_kernel/build] ninja tile_example_fmha_fwd
+```
+Running the build recipe will produce the executable `tile_example_fmha_fwd`.
+
+The executables reside in `bin` subdirectory of the build directory.
+
+This example provides recipes for `tile_example_fmha_fwd`, `tile_example_fmha_bwd`, `tile_example_fmha_fwd_v3`.
+
+> [!NOTE]
+> `cmake-ck-dev.sh` is a CMake wrapper.
+>
+> The first argument is the path to composable_kernel sources.
+>
+> The second argument is the gfx architectures string (e.g. "gfx950" or "gfx90a;gfx942").
+>
+> The remaining arguments are optional and are passed through to CMake.
+> E.g. `-G Ninja` specifies ninja as the build system.
+
+## kernel
+The kernel template is `fmha_fwd_kernel.hpp`, this is the grid-wise op in old ck_tile's terminology. We put it here purposely, to demonstrate one can construct a kernel by using various internal component from ck_tile. We may still have an implementation under ck_tile's include path (in the future) for the kernel template.
+
+There are 2 template parameters for this kernel template.
+* `FmhaPipeline` is one of the block_tile_pipelines (under `include/ck_tile/tile_program/block_tile_pipeline`), which is a performance-critical component. Indeed, we did a lot of optimization and trials to optimize the pipeline, and may still work out more performant pipelines and add them to that folder. People only need to replace this pipeline type to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)).
+* `EpiloguePipeline` will modify and store out the result in the last phase. People usually do a lot of post-fusion at this stage, so we also abstract this concept. Currently we don't do much at the epilogue stage, but leave room for possible future support.
+
+## codegen
+To speed up compile time, we instantiate the kernels into separate files. In this way we can benefit from parallel building with the CMake/Make system. This is achieved by the `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in the `FMHA_FWD_KERNEL_BODY` variable.
+
+## executable
+`tile_example_fmha_fwd` is the example executable, implemented in `example_fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may be subject to change)
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -mode    kernel mode. 0:batch, 1:group (default:0)
+          -b    batch size (default:2)
+          -h    num of head, for q (default:8)
+        -h_k    num of head, for k/v, -1 means equal to h (default:-1)
+                if not equal to h, then this is GQA/MQA case
+          -s    seqlen_q. if group-mode, means the average value of seqlen_q (default:3328)
+                total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary
+                also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode)
+        -s_k    seqlen_k (including new key/value), -1 means equal to s (default:-1)
+                also with "-s_k=s0,s1,s2..." comma-separated ints to set seqlen per batch (group mode)
+     -s_qpad    seqlen_q stride between 2 batches (group-mode optional) (default:-1)
+                Provide positive strides per-batch to simulate physical padding on Q
+     -s_kpad    seqlen_k stride between 2 batches, currently used in group-mode only  (default:-1)
+                for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride
+                along seqlen, instead of packed, same as xformer kv_padding,
+                must be greater than or equal to s_k
+          -d    head dim for q, k (default:128)
+        -d_v    head dim for v, -1 means equal to d (default:-1)
+    -scale_s    scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0)
+     -qscale    n or 0, no scaling (default:n)
+                1: per-tensor quantization.
+      -iperm    permute input (default:1)
+                if true, will be b*h*s*d, else b*s*h*d
+      -operm    permute output (default:1)
+       -bias    n or 0, no bias (default:n)
+                e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s
+                a(libi) or 2, alibi with 1*h. a:1, b*h
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+       -mask    0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0)
+                't', top-left causal mask, 'b', bottom-r causal mask
+                't:l,r', top-left sliding window attn(swa) with FA style left right size
+                'b:l,r', bottom-r sliding window attn(swa) with FA style left right size
+                'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa
+                'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa
+                'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now)
+    -vlayout    r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
+        -lse    0 not store lse, 1 store lse (default:0)
+      -kname    if set to 1 will print kernel name (default:0)
+       -init    init method. ui, uniform random int, ni, normalized random int (default:uf)
+                uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization
+       -seed    random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939)
+  -drop_seed    seed for random number generator (default:1)
+-drop_offset    offset for random number generator (default:0)
+ -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
+ -num_splits    number of splits for key/value. 0 to determine actual number by heuristic (default:1)
+     -warmup    number of iterations before benchmark the kernel (default:5)
+     -repeat    number of iterations to benchmark the kernel (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:fmha_fwd.json)
+ -q_eff_lens    Batch-mode only: per-batch effective seqlen for Q (exclude PAD) (default:"")
+                Comma-separated list of length 'b'. If empty, no override
+-kv_eff_lens    Batch-mode only: per-batch effective seqlen for KV (exclude PAD) (default:"")
+                Comma-separated list of length 'b'. If empty, no override
+```
+Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
+Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with
+  batch=1, nhead=8, sequence length=16384, hdim=64, drop_seed=10 (in GPU memory), drop_offset=1234 (in GPU memory) fp16 case
+
+## Padding Examples
+Example 3 (Group mode with padding): `./bin/tile_example_fmha_fwd -mode=1 -b=2 -h=8 -s=1024,2048 -s_k=1024,2048 -s_qpad=1536,3072 -s_kpad=1536,3072 -d=128` will run group mode with 2 batches having different sequence lengths (1024, 2048) but physically padded to (1536, 3072) respectively.
+
+Example 4 (Batch mode with effective lengths): `./bin/tile_example_fmha_fwd -mode=0 -b=2 -h=8 -s=2048 -s_k=2048 -d=128 -q_eff_lens=1024,1536 -kv_eff_lens=1024,1536` will run batch mode where all batches use 2048 as physical sequence length but have effective lengths of (1024, 1536) for Q and KV respectively.
+
+## support features
+Currently we are still in rapid development stage, so more features/optimizations will be coming soon.
+
+### hdim
+Currently we support `32/64/128/256` hdim for `fp16`/`bf16`, within which `64`/`128` are better optimized. hdim should be a multiple of 8, while seqlen_s can be arbitrary. An arbitrary hdim can be supported through the padding kernel of the `qr` pipeline (we don't generate this in generate.py by default)
+
+### group/batch mode
+Currently we support both `batch mode` and `group mode` (or `varlen`, in FA's term), by setting `-mode` = `0` or `1`. In `group mode`, different kinds of attention masks are also supported (see below)
+
+### MQA/GQA
+By setting `-h` (nhead for q) and `-h_k` (nhead for k/v) to different numbers, you can achieve MQA/GQA. Please make sure that `h % h_k == 0` when you set different numbers.
+
+### input/output permute, and `b*s*3*h*d`
+If you look at the kernel argument inside `fmha_fwd_kernel.hpp`, we support providing arbitrary stride for seqlen(stride_q/k/v), nhead, batch of q/k/v matrix, hence it is very flexible to support `b*h*s*d` or `b*s*h*d` input/output permute. The `-iperm=0/1`, `-operm=0/1` is a convenient way to achieve this through the executable. We didn't provide a command-line arg to test `b*s*3*h*d` layout which is by default used by torch/FA, but it's trivial to achieve this if one set the proper `stride_q/k/v` value as `3*h*d`.
+
+### attention bias
+Attention bias is supported with the layout of `1*1*s*s` (similar to input/output, different layouts can be supported by changing the stride value for bias, or even extended to `b*h*s*s`) and with bias values as floating-point numbers.
+
+### alibi
+alibi is supported
+
+### lse
+For training kernels, the "log sum exp" needs to be stored out in the forward pass and used in the backward pass. We support this by setting `-lse=1`
+
+### vlayout
+We support v matrix in both row-major(`seqlen*hdim`) and col-major(`hdim*seqlen`). Since the accumulate(reduce) dimension for V is along `seqlen`, for current AMD's mfma layout which expect each thread to have contiguous register holding pixels along reduce dimension, it's easier to support col-major V layout. However, the performance of col-major is not necessarily faster than row-major, there are many factors that may affect the overall performance. We still provide the `-vlayout=r/c` here to switch/test between different layouts.
+
+### attention mask
+We support `causal mask` and `sliding window attention(swa)` mask in both batch and group mode, either from top-left or bottom-right.
+Underneath, we unify the mask expression into a `generic attention mask coordinate`, providing a uniform approach for each batch to locate the corresponding pixels that need to be masked out.
+![](misc/gamc.png)
+
+Since FA/xformer style with window_size_left/right is more popular, we accept window_size as a parameter and convert it internally to our generic coordinate (this coordinate can express more cases). Below are some examples of how to achieve different kinds of masks through the cmdline.
+
+| mask case|  cmdline    | FA style | xformer style |
+|----------|:-------------:|:-------------:|:-------------:|
+| no mask |  `-mask=0`(default) | | |
+| causal mask from top-left | `-mask=1` or `-mask=t` | `-mask=t:-1,0` | `-mask=xt:-1` |
+| causal mask from bottom-right | `-mask=2` or `-mask=b` | `-mask=b:-1,0` | `-mask=xb:-1` |
+| swa from top-left | | `-mask=t:3,5` | `-mask=xt:4` |
+| swa from bottom-right | |  `-mask=b:10,11` | `-mask=xb:16` |
+
+Note that FA uses bottom-right by default to express the swa case; here we require you to explicitly specify top-left/bottom-right.
+
+### dropout
+TBD
+
+### sequence padding and variable length support
+We support sequence padding and variable-length processing in both batch and group modes fmha forward to handle real-world scenarios where sequences have different lengths.
+
+**Group Mode Padding**: Use `-s_qpad` and `-s_kpad` to specify physical stride between batches, enabling padded layouts. Each batch can have different logical sequence lengths (`-s`, `-s_k`) but use larger physical strides for memory alignment.
+
+**Batch Mode Variable Length**: Use `-q_eff_lens` and `-kv_eff_lens` to specify effective sequence lengths per batch. All batches share the same physical sequence length, but the kernel processes only the effective portions. This enables efficient variable-length attention without memory waste.
+
+Both approaches optimize memory access patterns while supporting flexible sequence length requirements commonly found in transformer inference scenarios.
+
+## FP8 experimental support
+As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have experimental support for fp8 fmha kernels. You can evaluate the performance by passing the arg `-prec=fp8` to `tile_example_fmha_fwd`, on a gfx942 machine with ROCm 6.0+.
+
+Currently we only support `-vlayout=r` (`seqlen*hdim` for V matrix) for fp8 and fp8bf16. Full feature support will come later.
--- a/example/ck_tile/01_fmha/bias.hpp
+++ b/example/ck_tile/01_fmha/bias.hpp
@@ -0,0 +1,114 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha.hpp"
+
+// keep in sync with ck_tile::BlockAttentionBiasEnum
+enum class bias_enum
+{
+    no_bias          = 0,
+    elementwise_bias = 1,
+    alibi            = 2,
+};
+
+struct bias_info
+{
+    bias_enum type;
+    /*
+     * simple dispatch logic
+     *
+     * if type == elementwise_bias:
+     *      if rank_info == 0:
+     *           bias is 1*1*s*s
+     *      elif rank_info == 1:
+     *           bias is 1*h*s*s
+     *      elif rank_info == 2:
+     *           bias is b*h*s*s
+     *
+     * elif type == alibi:
+     *       if rank_info == 0:
+     *           alibi in 1*h
+     *       elif rank_info == 1:
+     *           alibi in b*h
+     */
+    int rank_info;
+
+    void serialize(std::ostream& os) const
+    {
+        if(type == bias_enum::no_bias)
+            os << "n";
+        else if(type == bias_enum::elementwise_bias)
+        {
+            os << "e";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+        else if(type == bias_enum::alibi)
+        {
+            os << "alibi";
+            if(rank_info != 0)
+            {
+                os << "[" << rank_info << "]";
+            }
+        }
+    }
+
+    static bias_info decode(std::string str)
+    {
+        bias_info info{bias_enum::no_bias, 0};
+        auto found_0 = str.find(':');
+        if(found_0 != std::string::npos)
+        {
+            std::string t = str.substr(0, found_0);
+            std::string v = str.substr(found_0 + 1);
+            if(t == "e" || t == "elementwise")
+            {
+                info.type      = bias_enum::elementwise_bias;
+                info.rank_info = std::stoi(v);
+                if(info.rank_info < 0 || info.rank_info > 2)
+                    throw std::invalid_argument("invalid bias rank: " + str);
+            }
+            else if(t == "a" || t == "alibi")
+            {
+                info.type      = bias_enum::alibi;
+                info.rank_info = std::stoi(v);
+                if(info.rank_info < 0 || info.rank_info > 1)
+                    throw std::invalid_argument("invalid bias rank: " + str);
+            }
+            else
+            {
+                throw std::invalid_argument("invalid bias value: " + str);
+            }
+        }
+        else if(str == "0" || str == "n")
+        {
+            info.type = bias_enum::no_bias;
+        }
+        else if(str == "1" || str == "e" || str == "elementwise")
+        {
+            info.type = bias_enum::elementwise_bias;
+        }
+        else if(str == "2" || str == "a" || str == "alibi")
+        {
+            info.type = bias_enum::alibi;
+        }
+        else
+        {
+            throw std::invalid_argument("invalid bias value: " + str);
+        }
+        return info;
+    }
+
+    friend std::ostream& operator<<([[clang::lifetimebound]] std::ostream& os, const bias_info& bi)
+    {
+        bi.serialize(os);
+        return os;
+    }
+};
--- a/example/ck_tile/01_fmha/codegen/init.py
+++ b/example/ck_tile/01_fmha/codegen/init.py
@@ -0,0 +1,3 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
--- a/example/ck_tile/01_fmha/codegen/arch.py
+++ b/example/ck_tile/01_fmha/codegen/arch.py
@@ -0,0 +1,42 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+from dataclasses import dataclass, field
+from typing import Any, List, Callable
+
+
+@dataclass(frozen=True)
+class ArchTrait:
+    name: str
+    preprocessor_check: str = field(default=None)
+    device_name_check: str = field(default=None)
+    tag: str = field(default=None)
+    filename_suffix: str = field(default=None)
+
+    def __post_init__(self):
+        if self.preprocessor_check is None:
+            object.__setattr__(self, "preprocessor_check", f"defined(__{self.name}__)")
+        if self.device_name_check is None:
+            object.__setattr__(
+                self,
+                "device_name_check",
+                f'device_name.compare(0, {len(self.name)}, "{self.name}") == 0',
+            )
+        if self.tag is None:
+            object.__setattr__(self, "tag", f"ck_tile::{self.name}_t")
+        if self.filename_suffix is None:
+            object.__setattr__(self, "filename_suffix", f"_{self.name}")
+
+
+def get_factories_for_targets(
+    targets: List[str], get_factory: Callable[[str], Any]
+) -> List[Any]:
+    factories = dict()
+    for target in targets:
+        factory = get_factory(target)
+        factories[factory.arch.name] = factory
+    # Sort by arch name length, longest first, so that more specific architectures come first
+    factories = sorted(
+        list(factories.values()), key=lambda f: len(f.arch.name), reverse=True
+    )
+    return factories
--- a/example/ck_tile/01_fmha/codegen/cmake_config.py
+++ b/example/ck_tile/01_fmha/codegen/cmake_config.py
@@ -0,0 +1,4 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+GEN_DIR = ""  # in Cmake, have to generate files in same folder
--- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
+++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py
@@ -0,0 +1,163 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+FWD_DTYPE_MAP = {
+    "fp32": "FmhaFwdFp32",
+    "fp16": "FmhaFwdFp16",
+    "bf16": "FmhaFwdBf16",
+    "fp8": "FmhaFwdFp8",
+    "fp8fp16": "FmhaFwdFp8Fp16",
+    "fp8bf16": "FmhaFwdFp8Bf16",
+    "fp8fp32": "FmhaFwdFp8Fp32",
+    "mxfp8": "FmhaFwdMxFp8",
+    "mxfp4": "FmhaFwdMxFp4",
+}
+
+BWD_DTYPE_MAP = {"fp32": "FmhaBwdFp32", "fp16": "FmhaBwdFp16", "bf16": "FmhaBwdBf16"}
+
+MASK_IMPL = {
+    "generic": "ck_tile::GenericAttentionMask",
+    "simplified": "ck_tile::SimplifiedGenericAttentionMask",
+}
+
+_MASK_SIMPLIFIED_MAP = {
+    "s_no": "ck_tile::SimplifiedGenericAttentionMask<false>",
+    "s_mask": "ck_tile::SimplifiedGenericAttentionMask<true>",
+}
+
+_MASK_MAP = {
+    "no": "FmhaMasks::NoMask",
+    "causal": "FmhaMasks::CausalMask",
+    "generic": "FmhaMasks::GenericMask",
+}
+
+
+def get_mask_map(mask_impl: str):
+    if mask_impl == "generic":
+        return _MASK_MAP
+    elif mask_impl == "simplified":
+        return _MASK_SIMPLIFIED_MAP
+    else:
+        assert False
+        return None
+
+
+def get_mask_impl(mask: str) -> str:
+    return "simplified" if mask.startswith("s_") else "generic"
+
+
+def get_mask_cpp_type(mask: str) -> str:
+    return get_mask_map(get_mask_impl(mask))[mask]
+
+
+_MASK_CHECK_MAP = {
+    "no": "t.mask_type == mask_enum::no_mask",
+    "causal": "t.mask_type == mask_enum::mask_top_left || t.mask_type == mask_enum::mask_bottom_right",
+    "generic": "t.mask_type == mask_enum::window_generic",
+}
+
+_MASK_SIMPLIFIED_CHECK_MAP = {
+    "s_no": "t.mask_type == mask_enum::no_mask",
+    "s_mask": "t.mask_type != mask_enum::no_mask",
+}
+
+
+def get_mask_check_map(mask: str):
+    if mask == "generic":
+        return _MASK_CHECK_MAP
+    elif mask == "simplified":
+        return _MASK_SIMPLIFIED_CHECK_MAP
+    else:
+        assert False
+        return None
+
+
+def get_mask_cpp_check_expr(mask: str) -> str:
+    return get_mask_check_map(get_mask_impl(mask))[mask]
+
+
+QSCALE_MAP = {
+    "no": "ck_tile::BlockAttentionQuantScaleEnum::NO_SCALE",
+    "pertensor": "ck_tile::BlockAttentionQuantScaleEnum::PERTENSOR",
+    "blockscale": "ck_tile::BlockAttentionQuantScaleEnum::BLOCKSCALE",
+    "kv_blockscale": "ck_tile::BlockAttentionQuantScaleEnum::KV_BLOCKSCALE",
+    "mx": "ck_tile::BlockAttentionQuantScaleEnum::MX",
+}
+
+QSCALE_CHECK_MAP = {
+    "no": "quant_scale_enum::no_scale",
+    "pertensor": "quant_scale_enum::pertensor",
+    "blockscale": "quant_scale_enum::blockscale",
+    "kv_blockscale": "quant_scale_enum::kv_blockscale",
+    "mx": "quant_scale_enum::mx",
+}
+
+BIAS_MAP = {
+    "no": "ck_tile::BlockAttentionBiasEnum::NO_BIAS",
+    "bias": "ck_tile::BlockAttentionBiasEnum::ELEMENTWISE_BIAS",
+    "alibi": "ck_tile::BlockAttentionBiasEnum::ALIBI",
+}
+
+# TODO: this is ugly
+BIAS_CHECK_MAP = {
+    "no": "bias_enum::no_bias",
+    "bias": "bias_enum::elementwise_bias",
+    "alibi": "bias_enum::alibi",
+}
+
+DROPOUT_MAP = {
+    "no": "ck_tile::BlockDropoutBwd<false, true,  false>",
+    "dropout_wg32": "ck_tile::BlockDropoutBwd<true,  true,  false>",
+    "dropout_wg32_storerandval": "ck_tile::BlockDropoutBwd<true,  true,  true >",
+    "dropout_wg16": "ck_tile::BlockDropoutBwd<true,  false, false>",
+    "dropout_wg16_storerandval": "ck_tile::BlockDropoutBwd<true,  false, true >",
+}
+
+DROPOUT_CHECK_MAP = {
+    "no": "t.has_dropout == false",
+    "dropout_wg32": "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg32_storerandval": "t.has_dropout == true && t.is_store_randval == true",
+    "dropout_wg16": "t.has_dropout == true && t.is_store_randval == false",
+    "dropout_wg16_storerandval": "t.has_dropout == true && t.is_store_randval == true",
+}
+
+ROPE_MAP = {
+    "no": "ck_tile::RotaryEmbeddingEnum::NONE",
+    "inter": "ck_tile::RotaryEmbeddingEnum::INTERLEAVED",
+    "half": "ck_tile::RotaryEmbeddingEnum::HALF_ROTATED",
+}
+
+ROPE_CHECK_MAP = {
+    "no": "rope_enum::none",
+    "inter": "rope_enum::interleaved",
+    "half": "rope_enum::half_rotated",
+}
+
+MODE_MAP = {"batch": "false", "group": "true"}
+
+LAYOUT_MAP = {"row": "true", "col": "false"}
+
+PIPELINE_MAP = {
+    "qr": "ck_tile::BlockFmhaPipelineQRKSVS",
+    "qr_async": "ck_tile::BlockFmhaPipelineQRKSVSAsync",
+    "qs": "ck_tile::BlockFmhaPipelineQSKSVS",
+    "qr_async_trload": "ck_tile::BlockFmhaPipelineQRKSVSAsyncTrload",
+    "qr_async_trload_v3": "ck_tile::BlockFmhaFwdV3Pipeline",
+}
+
+PIPELINE_ENUM_MAP = {
+    "qr": "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qr_async": "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC",
+    "qr_nwarp_sshuffle": "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qs": "ck_tile::BlockFmhaPipelineEnum::QSKSVS",
+    "qr_pagedkv": "ck_tile::BlockFmhaPipelineEnum::QRKSVS",
+    "qr_async_trload": "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD",
+    "qr_async_trload_v3": "ck_tile::BlockFmhaPipelineEnum::QRKSVS_ASYNC_TRLOAD_V3",
+}
+
+BOOL_MAP = {
+    "t": "true",
+    "f": "false",
+    True: "true",
+    False: "false",
+}
--- a/example/ck_tile/01_fmha/codegen/ops/init.py
+++ b/example/ck_tile/01_fmha/codegen/ops/init.py
@@ -0,0 +1,3 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_batch_prefill.py
@@ -0,0 +1,849 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+import copy
+from dataclasses import dataclass, field
+import fnmatch
+import itertools
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codegen.cmake_config import GEN_DIR
+from codegen.cpp_symbol_map import (
+    MODE_MAP,
+    LAYOUT_MAP,
+    BIAS_CHECK_MAP,
+    get_mask_check_map,
+    get_mask_map,
+    BIAS_MAP,
+    FWD_DTYPE_MAP,
+    BOOL_MAP,
+    PIPELINE_ENUM_MAP,
+    QSCALE_CHECK_MAP,
+    QSCALE_MAP,
+)
+from codegen.utils import update_file
+
+# Element width in bits for each dtype tag. Used by the qr_async hdim checks to
+# derive the divisor `(32 * 4) / bits` applied to hdim_q / hdim_v.
+DTYPE_BITS = {
+    "fp32": 32,
+    "fp16": 16,
+    "bf16": 16,
+    "fp8": 8,
+    "fp8bf16": 8,
+    "fp8fp32": 8,
+    "bf8": 8,
+}
+
+# bk0max -> modulus used by the "qr" pipeline hdim checks (96 is rounded up to 128).
+K0_MAX_SUBMAX_MAP = {32: 32, 64: 64, 96: 128, 128: 128, 256: 256}
+
+# KV-cache page sizes a kernel is generated for. get_fwd_blobs() only emits
+# page_size == 1 together with the "linear" memory layout.
+SUPPORTED_PAGE_SIZE = [1, 16, 1024]
+# Tags for the KV-cache memory layout / lookup-table flavour; every combination
+# is enumerated by KernelComponentFactory.get_pipelines().
+SUPPORTED_KV_MEMORY_LAYOUT = ["vectorized", "linear"]
+SUPPORTED_KV_LOOKUP_TABLE = ["vllm", "sglang"]
+# Memory-layout tag -> C++ enumerator text.
+KV_MEMORY_LAYOUT_ENUM_MAP = {
+    "vectorized": "ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::VECTORIZED_LAYOUT",
+    "linear": "ck_tile::BlockAttentionKVCacheMemoryLayoutEnum::LINEAR_LAYOUT",
+}
+# Lookup-table tag -> C++ enumerator text.
+KV_LOOKUP_TABLE_ENUM_MAP = {
+    "vllm": "ck_tile::BlockAttentionKVCacheLookupTableEnum::VLLM_BLOCK_TABLE_2D",
+    "sglang": "ck_tile::BlockAttentionKVCacheLookupTableEnum::SGLANG_PAGE_TABLE_1D",
+}
+
+
+# Pipeline tag -> batch-prefill pipeline class template. Only "qr_async" exists,
+# so FmhaFwdKernel.template raises KeyError for any other pipeline tag.
+FMHA_BATCH_PREFILL_PIPELINE_MAP = {
+    "qr_async": "ck_tile::BlockFmhaBatchPrefillPipelineQRKSVSAsync",
+}
+
+# Preamble placed at the top of every generated .cpp (both the per-kernel files
+# and the API dispatcher file).
+# The string is not raw, so the trailing `\n` on the copyright line is a real
+# newline escape and produces one extra blank line in the output.
+FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
+// auto generated by generate.py
+#include "ck_tile/ops/fmha/block/variants.hpp"
+#include "fmha_fwd.hpp"
+"""
+
+# `str.format` template for one kernel-instance translation unit.
+# `{F_*}` placeholders are filled in by FmhaFwdKernel.template; doubled braces
+# (`{{` / `}}`) are literal C++ braces. The generated file defines the explicit
+# specialization `fmha_batch_prefill_<trait_{F_idx}>` which builds the kernel
+# arguments and launches the kernel. `{F_idx}` only disambiguates the type
+# aliases inside a single file.
+FMHA_FWD_KERNEL_BODY = """
+using fmha_dtype_{F_idx} = {F_dtype};
+
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
+
+using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
+                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
+                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
+                                      {F_vlayout}>;
+
+using fmha_trait_{F_idx} = ck_tile::TileFmhaBatchPrefillTraits<{F_spad},
+                                                    {F_skpad},
+                                                    {F_dpad},
+                                                    {F_dvpad},
+                                                    {F_logits},
+                                                    {F_bias},
+                                                    false,
+                                                    {F_lse},
+                                                    {F_dropout},
+                                                    {F_qscale},
+                                                    {F_occupancy},
+                                                    false,
+                                                    {F_page_size},
+                                                    {F_kv_memory_layout},
+                                                    {F_kv_lookup_table}>;
+
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
+
+using fmha_mask_{F_idx} = {F_mask};
+
+using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaBatchPrefillPipelineProblem<
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::RandValOutputDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
+    fmha_shape_{F_idx},
+    {F_mode},
+    fmha_variant_{F_idx},
+    fmha_mask_{F_idx},
+    false,
+    {F_page_size},
+    fmha_trait_{F_idx}>;
+
+using fmha_pipeline_{F_idx} = {F_pipeline}<
+    fmha_pipeline_problem_{F_idx}>;
+
+using fmha_epilogue_{F_idx} =
+    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
+                                           typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
+                                           {F_spad}, {F_dvpad}>>;
+
+using fmha_kernel_{F_idx} =
+    ck_tile::FmhaBatchPrefillWithPagedKVCacheKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
+
+using trait_{F_idx} = fmha_fwd_batch_prefill_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false, false, {F_page_size}, {F_kv_memory_layout}, {F_kv_lookup_table}>;
+
+#include <iostream>
+
+template<>
+float fmha_batch_prefill_<trait_{F_idx}>(const ck_tile::stream_config& s, fmha_batch_prefill_args a)
+{{
+    using k_ = fmha_kernel_{F_idx};
+    if(s.log_level_ > 0)
+        std::cout << ", {F_kname}" << std::flush;
+    auto [kargs, grids] = fmha_batch_prefill_create_kargs_and_grids<k_>(a);
+    const dim3 blocks                      = k_::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu>(k_{{}}, grids, blocks, 0, kargs));
+}}
+"""
+
+# File name of the generated runtime dispatcher.
+FMHA_FWD_API_FILENAME = "fmha_batch_prefill_api.cpp"
+# `str.format` template for the dispatcher translation unit; `{F_dispatch}` is
+# replaced by the if/else-if chain built in FmhaFwdApiPool.api, and doubled
+# braces are literal C++ braces.
+# The generated `fmha_batch_prefill()` returns -1 when the device query fails or
+# when no generated kernel matches the requested traits.
+# `num_cus`, `min_cu_util_rate` and `get_num_blocks` exist so that CppConstraint
+# expressions (see CustomFactory) can refer to them from inside the dispatch chain.
+# NOTE(review): the two fprintf messages have no trailing newline.
+FMHA_FWD_API = """
+#include <cstdio>
+
+namespace {{
+bool get_num_cus(unsigned& num_cu) {{
+    int device;
+    auto status = hipGetDevice(&device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device");
+        return false;
+    }}
+
+    hipDeviceProp_t props{{}};
+    status = hipGetDeviceProperties(&props, device);
+    if(status != hipSuccess) {{
+        fprintf(stderr, "failed to get device properties");
+        return false;
+    }}
+
+    num_cu = props.multiProcessorCount;
+    return true;
+}}
+
+unsigned get_num_thread_blocks(unsigned batch, unsigned nheads, unsigned max_seqlen_q, unsigned kM0) {{
+    const unsigned num_m_blocks = (max_seqlen_q + kM0 - 1) / kM0;
+    const unsigned num_n_blocks = 1; // we assume that num_n_blocks is always 1
+
+    return batch * nheads * num_m_blocks * num_n_blocks;
+}}
+}} // namespace
+
+float fmha_batch_prefill(fmha_batch_prefill_traits t, fmha_batch_prefill_args a, const ck_tile::stream_config& s) {{
+    float r = -1;
+
+    [[maybe_unused]] const float min_cu_util_rate = 0.8; // minimum CU utilization rate
+
+    unsigned num_cus;
+    if (!get_num_cus(num_cus)) {{
+        return r;
+    }}
+
+    [[maybe_unused]] auto get_num_blocks = [&](unsigned kM0) {{
+        return get_num_thread_blocks(a.batch, a.nhead_q, a.max_seqlen_q, kM0);
+    }};
+
+{F_dispatch}
+    return r;
+}}
+"""
+
+# Three nested `str.format` templates that FmhaFwdApiPool.api stitches into the
+# dispatcher body. `{F_if}` is "if" for the first sibling and "else if" for the
+# rest, so each nesting level forms one if/else-if chain.
+
+# Outer level: one branch per dtype tag, compared against `t.data_type`.
+FMHA_FWD_API_PER_DTYPE = """    {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
+{F_hdim_case}
+    }}
+"""
+# Middle level: one branch per head-dim bucket; the first bucket whose bounds
+# cover both hdim_q and hdim_v is taken.
+FMHA_FWD_API_PER_HDIM_CASE = """        {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim_v}) {{
+{F_inner_dispatch}
+        }}
+"""
+
+# Inner level: one branch per generated kernel. The condition combines the
+# feature flags on `t`, the padding/shape checks, the extra CppConstraint and
+# the KV-cache configuration; the body calls the matching specialization.
+FMHA_FWD_API_INNER_DISPATCH = """            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.has_dropout == {F_dropout}) && (t.qscale_type == {F_qscale_check}) &&
+                        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && ({F_constraint}) && (t.kv_memory_layout == {F_kv_memory_layout}) && (t.kv_lookup_table == {F_kv_lookup_table}) && (t.page_size == {F_page_size})) {{
+                using trait_ = fmha_fwd_batch_prefill_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_dropout}, {F_qscale}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, false, false, {F_page_size}, {F_kv_memory_layout}, {F_kv_lookup_table}>;
+                return fmha_batch_prefill_<trait_>(s, a);
+            }}
+"""
+
+
+@dataclass
+class CppConstraint:
+    bool_expr: str = None
+
+    def __str__(self):
+        if self.bool_expr is None:
+            return "true"
+        else:
+            return f"{self.bool_expr}"
+
+    def __and__(self, other):
+        return CppConstraint(f"({str(self)}) && ({str(other)})")
+
+
+@dataclass
+class FmhaFwdApiTrait:
+    pipeline_tag: str
+    # sync with fmha_fwd_traits<>, to generate fallback calls
+    hdim: str
+    dtype: str  # data type
+    mode: str  # value from MODE_MAP
+    bm0: int  # tile size along q seqlen (block size)
+    bn0: int  # tile size along qk seqlen
+    bk0: int  # tile size along qk gemm unroll
+    bn1: int  # tile size along v head_dim
+    bk1: int  # tile size along kv gemm unroll
+    bk0max: int
+    vlayout: str
+    logits: str
+    mask: str
+    bias: str  #
+    lse: str  #
+    dropout: str
+    qscale: str  #
+    spad: str
+    skpad: str
+    dpad: str
+    dvpad: str
+    constraint: CppConstraint
+    kv_memory_layout: str
+    kv_lookup_table: str
+    page_size: int = 1  # page block size
+
+    @property
+    def name(self) -> str:
+        return (
+            f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn0}-{self.bk1}-{self.bk0max}-"
+            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.dropout}-{self.qscale}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.kv_memory_layout}-{self.kv_lookup_table}-ps{self.page_size}"
+        )
+
+    @property
+    def scheck(self) -> str:
+        if self.mode == "group":
+            return "true/*group mode spad always true*/"  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == "qr_async":
+            if self.spad == "t":
+                return "true"  # always support
+            else:
+                return "true"
+        elif self.pipeline_tag in ["qr"]:
+            if self.spad == "t":
+                return f"true /*a.seqlen_q % {self.bm0} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.seqlen_q % {self.bm0} == 0"
+        else:
+            assert False
+
+    @property
+    def skcheck(self) -> str:
+        if self.mode == "group":
+            return "true/*group mode skpad always true*/"  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == "qr_async":
+            if self.skpad == "t":
+                return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0"
+            else:
+                return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0"
+        elif self.pipeline_tag in ["qr", "qr_fp8"]:
+            if self.skpad == "t":
+                return f"true /*a.seqlen_k % {self.bn0} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.seqlen_k % {self.bn0} == 0"
+        else:
+            assert False
+
+    @property
+    def dcheck(self) -> str:
+        if self.pipeline_tag == "qr_async":
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dpad == "t":
+                return f"a.hdim_q % {vec} == 0"
+            else:
+                assert False
+        elif self.pipeline_tag in ["qr"]:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == "t":
+                return f"true /*a.hdim_q % {bk0submax} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.hdim_q % {bk0submax} == 0"
+        else:
+            assert False
+
+    @property
+    def dvcheck(self) -> str:
+        if self.pipeline_tag == "qr_async":
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dvpad == "t":
+                return f"a.hdim_v % {vec} == 0"
+            else:
+                assert False
+        elif self.pipeline_tag in ["qr"]:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == "t":
+                return f"true /*a.hdim_v % {bk0submax} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.hdim_v % {bk0submax} == 0"
+        else:
+            assert False
+
+
+@dataclass
+class FmhaFwdPipeline:
+    tag: str
+
+    F_vlayout: str  # row/col
+    F_spad: str  # true/false
+    F_skpad: str  #
+    F_dpad: str  #
+    F_dvpad: str  #
+    F_logits: str  # t/f
+    F_bias: str  # true/false
+    F_lse: str  #
+    F_dropout: str  #
+    F_qscale: str  # no/pertensor
+    F_mask: str  # value from MASK_MAP
+    F_kv_memory_layout: str  #
+    F_kv_lookup_table: str  #
+    F_constraint: CppConstraint = field(default_factory=lambda: CppConstraint())
+
+    @property
+    def name(self) -> str:
+        def pad_name() -> str:
+            n = ""
+            if self.F_spad == "t":
+                n += "s"
+            if self.F_skpad == "t":
+                n += "sk"
+            if self.F_dpad == "t":
+                n += "d"
+            if self.F_dvpad == "t":
+                n += "dv"
+            if n != "":
+                n = "p" + n
+            return n
+
+        pn = pad_name()
+        n = f"{self.tag}_v{self.F_vlayout[0]}"
+        if pn != "":
+            n += f"_{pn}"
+        else:
+            n += "_npad"
+
+        if self.F_logits == "t":
+            n += "_logits"
+        else:
+            n += "_nlogits"
+
+        if self.F_bias != "no":
+            n += f"_{self.F_bias}"
+        else:
+            n += "_nbias"
+
+        if self.F_mask[0:2] == "s_":
+            if self.F_mask == "s_mask":
+                n += "_mask"
+            else:
+                n += "_nmask"
+        else:
+            if self.F_mask != "no":
+                n += f"_m{self.F_mask[0]}"
+            else:
+                n += "_nmask"
+
+        if self.F_lse == "t":
+            n += "_lse"
+        else:
+            n += "_nlse"
+
+        if self.F_dropout == "t":
+            n += "_dropout"
+        else:
+            n += "_ndropout"
+
+        if self.F_qscale != "no":
+            n += f"_{self.F_qscale}"
+        else:
+            n += "_nqscale"
+
+        n += "_" + self.F_kv_memory_layout + "_" + self.F_kv_lookup_table
+        return n
+
+
+class FmhaFwdApiPool:
+    def __init__(self, mask_impl):
+        self.pool = dict()
+        self.mask_impl = mask_impl
+
+    def register_traits(self, trait: FmhaFwdApiTrait) -> None:
+        # TODO: do we need to check duplication?
+        if trait.dtype not in self.pool.keys():
+            self.pool[trait.dtype] = dict()
+        if trait.hdim not in self.pool[trait.dtype].keys():
+            self.pool[trait.dtype][trait.hdim] = list()
+
+        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+
+    @property
+    def api(self) -> str:
+        per_dtypes = str()
+        for i, dtype in enumerate(self.pool.keys()):
+            per_hdim_case = str()
+            for j, hdim in enumerate(self.pool[dtype].keys()):
+                traits = self.pool[dtype][hdim]
+                inners = str()
+                for k, trait in enumerate(traits):
+                    if_k = "if" if k == 0 else "else if"
+                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(
+                        F_if=if_k,
+                        F_mode=MODE_MAP[trait.mode],
+                        F_vlayout=LAYOUT_MAP[trait.vlayout],
+                        F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag],
+                        F_logits=BOOL_MAP[trait.logits],
+                        F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                        F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask],
+                        F_bias_check=BIAS_CHECK_MAP[trait.bias],
+                        F_bias=BIAS_MAP[trait.bias],
+                        F_lse=BOOL_MAP[trait.lse],
+                        F_dropout=BOOL_MAP[trait.dropout],
+                        F_qscale_check=QSCALE_CHECK_MAP[trait.qscale],
+                        F_qscale=QSCALE_MAP[trait.qscale],
+                        F_scheck=trait.scheck,
+                        F_skcheck=trait.skcheck,
+                        F_dcheck=trait.dcheck,
+                        F_dvcheck=trait.dvcheck,
+                        F_constraint=trait.constraint,
+                        F_spad=BOOL_MAP[trait.spad],
+                        F_skpad=BOOL_MAP[trait.skpad],
+                        F_dpad=BOOL_MAP[trait.dpad],
+                        F_dvpad=BOOL_MAP[trait.dvpad],
+                        F_bm0=trait.bm0,
+                        F_bn0=trait.bn0,
+                        F_bk0=trait.bk0,
+                        F_bn1=trait.bn1,
+                        F_bk1=trait.bk1,
+                        F_bk0max=trait.bk0max,
+                        F_hdim=hdim,
+                        F_dtype=FWD_DTYPE_MAP[dtype],
+                        F_kv_memory_layout=KV_MEMORY_LAYOUT_ENUM_MAP[
+                            trait.kv_memory_layout
+                        ],
+                        F_kv_lookup_table=KV_LOOKUP_TABLE_ENUM_MAP[
+                            trait.kv_lookup_table
+                        ],
+                        F_page_size=trait.page_size,
+                    )
+                if_j = "if" if j == 0 else "else if"
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(
+                    F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners
+                )
+            if_i = "if" if i == 0 else "else if"
+            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(
+                F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case
+            )
+        if not per_dtypes:
+            # empty string we add some ignore to suppress warning in api
+            per_dtypes += "    (void)t; (void)s; (void)a;"
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=per_dtypes)
+
+
+@dataclass
+class FmhaFwdTileSize:
+    F_bm0: int  # tile size along q seqlen (block size)
+    F_bn0: int  # tile size along k seqlen
+    F_bk0: int  # tile size along qk gemm unroll
+    F_bn1: int  # tile size along v head_dim
+    F_bk1: int  # tile size along kv gemm unroll
+    F_bk0max: int  # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
+    F_rm0: int  # number of warps for gemm0 along q seqlen
+    F_rn0: int  # number of warps for gemm0 along k seqlen
+    F_rk0: int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1: int  # number of warps for gemm1 along q seqlen
+    F_rn1: int  # number of warps for gemm1 along head dim v
+    F_rk1: int  # number of warps for gemm1 along k seqlen (not used)
+    F_wm0: int  # gemm0 warp size along m
+    F_wn0: int  # gemm0 warp size along n
+    F_wk0: int  # gemm0 warp size along k
+    F_wm1: int  # gemm1 warp size along m
+    F_wn1: int  # gemm1 warp size along n
+    F_wk1: int  # gemm1 warp size along k
+    F_occupancy: int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+    F_constraint: CppConstraint = field(default_factory=lambda: CppConstraint())
+
+    @property
+    def name(self) -> str:
+        return (
+            f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}"
+            + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}"
+            + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}"
+            + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+        )
+
+
+@dataclass
+class FmhaFwdKernel:
+    F_idx: int  # this is not a tunable, but a counter to differentiate symbol
+    F_hdim: int  # hdim
+    F_dtype: str  # data type
+    F_mode: str  # value from MODE_MAP
+    F_tile: FmhaFwdTileSize
+    F_pipeline: FmhaFwdPipeline
+    mask_impl: str
+    F_page_size: int = 1  # page block size
+
+    @property
+    def template(self) -> str:
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format(
+            F_kname=self.name,
+            F_idx=self.F_idx,
+            F_hdim=self.F_hdim,
+            F_dtype=FWD_DTYPE_MAP[self.F_dtype],
+            F_bm0=self.F_tile.F_bm0,
+            F_bn0=self.F_tile.F_bn0,
+            F_bk0=self.F_tile.F_bk0,
+            F_bn1=self.F_tile.F_bn1,
+            F_bk1=self.F_tile.F_bk1,
+            F_bk0max=self.F_tile.F_bk0max,
+            F_rm0=self.F_tile.F_rm0,
+            F_rn0=self.F_tile.F_rn0,
+            F_rk0=self.F_tile.F_rk0,
+            F_rm1=self.F_tile.F_rm1,
+            F_rn1=self.F_tile.F_rn1,
+            F_rk1=self.F_tile.F_rk1,
+            F_wm0=self.F_tile.F_wm0,
+            F_wn0=self.F_tile.F_wn0,
+            F_wk0=self.F_tile.F_wk0,
+            F_wm1=self.F_tile.F_wm1,
+            F_wn1=self.F_tile.F_wn1,
+            F_wk1=self.F_tile.F_wk1,
+            F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout],
+            F_spad=BOOL_MAP[self.F_pipeline.F_spad],
+            F_skpad=BOOL_MAP[self.F_pipeline.F_skpad],
+            F_dpad=BOOL_MAP[self.F_pipeline.F_dpad],
+            F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad],
+            F_logits=BOOL_MAP[self.F_pipeline.F_logits],
+            F_bias=BIAS_MAP[self.F_pipeline.F_bias],
+            F_lse=BOOL_MAP[self.F_pipeline.F_lse],
+            F_dropout=BOOL_MAP[self.F_pipeline.F_dropout],
+            F_qscale=QSCALE_MAP[self.F_pipeline.F_qscale],
+            F_occupancy=self.F_tile.F_occupancy,
+            F_kv_memory_layout=KV_MEMORY_LAYOUT_ENUM_MAP[
+                self.F_pipeline.F_kv_memory_layout
+            ],
+            F_kv_lookup_table=KV_LOOKUP_TABLE_ENUM_MAP[
+                self.F_pipeline.F_kv_lookup_table
+            ],
+            F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag],
+            F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
+            F_mode=MODE_MAP[self.F_mode],
+            F_pipeline=FMHA_BATCH_PREFILL_PIPELINE_MAP[self.F_pipeline.tag],
+            F_page_size=self.F_page_size,
+        )
+
+    @property
+    def name(self) -> str:
+        # TODO: we don't encode idx here
+        return (
+            f"fmha_batch_prefill_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_ps{self.F_page_size}_"
+            + self.F_tile.name
+            + "_"
+            + self.F_pipeline.name
+        )
+
+    @property
+    def filename(self) -> str:
+        return self.name + ".cpp"
+
+    def api_trait(self) -> FmhaFwdApiTrait:
+        return FmhaFwdApiTrait(
+            pipeline_tag=self.F_pipeline.tag,
+            hdim=str(self.F_hdim),
+            dtype=self.F_dtype,
+            mode=self.F_mode,
+            bm0=self.F_tile.F_bm0,
+            bn0=self.F_tile.F_bn0,
+            bk0=self.F_tile.F_bk0,
+            bn1=self.F_tile.F_bn1,
+            bk1=self.F_tile.F_bk1,
+            bk0max=self.F_tile.F_bk0max,
+            vlayout=self.F_pipeline.F_vlayout,
+            mask=self.F_pipeline.F_mask,
+            logits=self.F_pipeline.F_logits,
+            bias=self.F_pipeline.F_bias,
+            lse=self.F_pipeline.F_lse,
+            dropout=self.F_pipeline.F_dropout,
+            qscale=self.F_pipeline.F_qscale,
+            spad=self.F_pipeline.F_spad,
+            skpad=self.F_pipeline.F_skpad,
+            dpad=self.F_pipeline.F_dpad,
+            dvpad=self.F_pipeline.F_dvpad,
+            constraint=self.F_tile.F_constraint & self.F_pipeline.F_constraint,
+            kv_memory_layout=self.F_pipeline.F_kv_memory_layout,
+            kv_lookup_table=self.F_pipeline.F_kv_lookup_table,
+            page_size=self.F_page_size,
+        )
+
+
+class KernelComponentFactory:
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        if dtype in ["fp16", "bf16"]:
+            return {
+                128 : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+                256 : [FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1)],
+            }  # fmt: skip
+        elif dtype in ["fp8bf16"]:
+            return {
+                128 : [FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1)],
+            }  # fmt: skip
+        else:
+            return None
+
+    @staticmethod
+    def get_pipelines(dtype, hdim, receipt, mask_impl) -> List[FmhaFwdPipeline]:
+        # this function will populate a list possible pipelines
+        # TODO: the order of List matters! the later in this list will be also be checked later
+        # TODO: currently for qr pipeline, let 't' padding to appear later!!
+        # TODO: how to design this more generic?
+        pipelines = []
+        if dtype in ["fp16", "bf16"]:
+            qscale = "no"
+            for (
+                logits,
+                mask,
+                bias,
+                lse,
+                dropout,
+                kv_memory_layout,
+                kv_lookup_table,
+            ) in itertools.product(
+                ["t", "f"],
+                get_mask_map(mask_impl).keys(),
+                BIAS_MAP.keys(),
+                ["t", "f"],
+                ["t", "f"],
+                SUPPORTED_KV_MEMORY_LAYOUT,
+                SUPPORTED_KV_LOOKUP_TABLE,
+            ):
+                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, lse, dropout, qscale, mask, kv_memory_layout, kv_lookup_table))  # fmt: skip
+        elif dtype in ["fp8bf16"]:
+            # no need lse/dropout kernels
+            for (
+                logits,
+                qscale,
+                mask,
+                bias,
+                kv_memory_layout,
+                kv_lookup_table,
+            ) in itertools.product(
+                ["t", "f"],
+                ["pertensor", "kv_blockscale"],
+                get_mask_map(mask_impl).keys(),
+                ["no"],
+                SUPPORTED_KV_MEMORY_LAYOUT,
+                SUPPORTED_KV_LOOKUP_TABLE,
+            ):
+                pipelines.append(FmhaFwdPipeline("qr_async", "row", "t", "t", "t", "t", logits, bias, "f", "f", qscale, mask, kv_memory_layout, kv_lookup_table))  # fmt: skip
+        else:
+            assert False
+        return pipelines
+
+
+class CustomFactory(KernelComponentFactory):
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        result = KernelComponentFactory.get_hdim_tile_size_dict(dtype)
+        if dtype in ["fp16", "bf16"]:
+            if 128 in result.keys():
+                result[128].insert(0, FmhaFwdTileSize( 64, 128, 64, 128, 64,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1, CppConstraint("get_num_blocks(128) < num_cus * min_cu_util_rate")))  # fmt: skip
+        return result
+
+
+def get_fwd_blobs(
+    kernel_filter: Optional[str], receipt, optdim_list, mask_impl
+) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
+    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
+    #       support this in future
+
+    gen = list()
+    api_pool = FmhaFwdApiPool(mask_impl)
+
+    for dtype in FWD_DTYPE_MAP.keys():
+        d = CustomFactory.get_hdim_tile_size_dict(dtype)
+        if d is None:
+            continue
+        # for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
+        for (hdim, tiles), mode in itertools.product(d.items(), MODE_MAP.keys()):
+            for tile, pipeline in itertools.product(
+                tiles, CustomFactory.get_pipelines(dtype, hdim, receipt, mask_impl)
+            ):
+                if mode == "group":
+                    if pipeline.F_spad != "t" or pipeline.F_skpad != "t":
+                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
+                        continue
+                if hdim == 192 and tile.F_bn1 == 128:
+                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
+                    if (
+                        pipeline.F_bias != "no"
+                        or pipeline.F_lse == "t"
+                        or pipeline.F_dropout == "t"
+                    ):
+                        continue
+                # logits_soft_cap is only allowed if no bias
+                if not (
+                    (pipeline.F_logits == "t" and pipeline.F_bias == "no")
+                    or pipeline.F_logits == "f"
+                ):
+                    continue
+
+                # Generate kernels for both page_size=16 and page_size=1024
+                for page_size in SUPPORTED_PAGE_SIZE:
+                    if page_size == 1 and pipeline.F_kv_memory_layout != "linear":
+                        continue
+                    # kv_blockscale requires page_size >= kN0 (tile.F_bn0)
+                    # This ensures all tokens in a main loop iteration belong to the same page
+                    if pipeline.F_qscale == "kv_blockscale" and page_size < tile.F_bn0:
+                        continue
+                    k = FmhaFwdKernel(
+                        F_idx=0,
+                        F_hdim=hdim,
+                        F_dtype=dtype,
+                        F_mode=mode,
+                        F_tile=tile,
+                        F_pipeline=pipeline,
+                        mask_impl=mask_impl,
+                        F_page_size=page_size,
+                    )
+                    if kernel_filter != "":
+                        if not fnmatch.fnmatch(k.name, kernel_filter):
+                            continue
+                    if optdim_list != [-1]:
+                        if hdim not in optdim_list:
+                            continue
+                    # 2 - Flash attention integration
+                    if receipt in (2, 3):
+                        cond = dtype in ["fp16", "bf16"]
+                        cond &= pipeline.F_vlayout == "row"
+                        cond &= pipeline.F_bias in ["no", "alibi"]
+                        cond &= pipeline.F_qscale == "no"
+                        if not cond:
+                            continue
+                    # PyTorch integration
+                    elif receipt == 4:
+                        cond = dtype in ["fp16", "bf16"]
+                        cond &= pipeline.F_vlayout == "row"
+                        cond &= pipeline.F_bias in ["no", "bias"]
+                        cond &= pipeline.F_qscale == "no"
+                        if not cond:
+                            continue
+                    # Aiter(mha_fwd) integration
+                    elif receipt == 100:
+                        cond = dtype in ["fp16", "bf16"]
+                        cond &= mode == "batch"
+                        cond &= pipeline.F_vlayout == "row"
+                        cond &= pipeline.F_qscale == "no"
+                        if not cond:
+                            continue
+                    # Aiter(mha_batch_prefill) integration
+                    elif receipt == 200:
+                        cond = dtype in ["fp16", "bf16", "fp8bf16"]
+                        cond &= mode == "group"
+                        cond &= pipeline.F_vlayout == "row"
+                        if not cond:
+                            continue
+                    # aiter::mha_batch_prefill C++ api integration
+                    elif receipt == 600:
+                        cond = dtype in ["fp16", "bf16", "fp8bf16"]
+                        cond &= mode == "group"
+                        cond &= pipeline.F_vlayout == "row"
+                        cond &= pipeline.F_qscale == "no"
+                        if not cond:
+                            continue
+
+                    # fp32 only
+                    if receipt == 800 or receipt == 801:
+                        cond = dtype == "fp32"
+                        if not cond:
+                            continue
+
+                    api_pool.register_traits(k.api_trait())
+                    gen.append(k)
+
+    return (api_pool, gen)
+
+
+def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
+    update_file(autogen_dir / kernel.filename, kernel.template)
+
+
+def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None:
+    update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api)
+
+
+def write_blobs(
+    targets: List[str],
+    output_dir: Path,
+    kernel_filter: str,
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    api_pool, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+    for kernel in kernels:
+        write_single_fwd_kernel(kernel, output_dir)
+    write_fwd_api(api_pool, output_dir)
+
+
+def list_blobs(
+    targets: List[str],
+    file_path: Path,
+    kernel_filter: str,
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    with file_path.open("a") as f:
+        _, kernels = get_fwd_blobs(kernel_filter, receipt, optdim_list, mask_impl)
+        for kernel in kernels:
+            f.write((file_path.parent / GEN_DIR / kernel.filename).as_posix() + "\n")
+        f.write((file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME).as_posix() + "\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -0,0 +1,519 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+import copy
+import fnmatch
+import itertools
+from collections import OrderedDict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codegen.arch import ArchTrait, get_factories_for_targets
+from codegen.cmake_config import GEN_DIR
+from codegen.cpp_symbol_map import (
+    FWD_DTYPE_MAP,
+    BOOL_MAP,
+    ROPE_MAP,
+    LAYOUT_MAP,
+    ROPE_CHECK_MAP,
+)
+from codegen.utils import check_duplicates_and_paddings, if_, indent, update_file
+
+from codegen.ops.fmha_fwd import (
+    FMHA_FWD_KERNEL_HEADER,
+    FMHA_FWD_API_PER_ARCH,
+    FMHA_FWD_API_PER_DTYPE,
+    FMHA_FWD_API_PER_HDIM_CASE,
+)
+
+
+# C++ source template for a single appendkv kernel instance.
+# It is rendered with str.format() by FmhaFwdAppendKVKernel.template, so every
+# "{F_*}" field is a placeholder and doubled braces ("{{", "}}") become literal
+# braces in the generated file.
+FMHA_FWD_APPENDKV_KERNEL_BODY = """
+#include <iostream>
+
+#if !defined(__HIP_DEVICE_COMPILE__) || ({F_arch.preprocessor_check})
+
+using fmha_dtype_{F_idx} = {F_dtype};
+
+using fmha_trait_{F_idx} = ck_tile::TileFmhaFwdAppendKVTraits<{F_spad},
+                                                    {F_skpad},
+                                                    {F_dpad},
+                                                    {F_dvpad},
+                                                    {F_occupancy}>;
+
+using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipelineProblem<
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
+    {F_bs},
+    {F_bsk},
+    {F_bd},
+    {F_bdv},
+    {F_vlayout},
+    {F_rope},
+    {F_pagedkv},
+    fmha_trait_{F_idx}>;
+
+using fmha_pipeline_{F_idx} = ck_tile::BlockFmhaFwdAppendKVPipeline<
+    fmha_pipeline_problem_{F_idx}>;
+
+using fmha_kernel_{F_idx} = ck_tile::FmhaFwdAppendKVKernel<fmha_pipeline_{F_idx}>;
+
+using trait_{F_idx} = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout},
+                        {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;
+
+template<>
+float fmha_fwd_appendkv_<trait_{F_idx}, {F_arch.tag}>(const ck_tile::stream_config& s, fmha_fwd_appendkv_args a)
+{{
+    using k_ = fmha_kernel_{F_idx};
+    if(s.log_level_ > 0)
+        std::cout << ", " << k_::GetName() << std::flush;
+    auto [kargs, grids] = fmha_fwd_appendkv_create_kargs_and_grids<k_>(a);
+    const dim3 blocks                      = k_::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu, {F_arch.tag}>(k_{{}}, grids, blocks, 0, kargs));
+}}
+
+#endif // !defined(__HIP_DEVICE_COMPILE__) || ({F_arch.preprocessor_check})
+"""
+
+# File name of the generated appendkv dispatcher source.
+FMHA_FWD_APPENDKV_API_FILENAME = "fmha_fwd_appendkv_api.cpp"
+# C++ template of the dispatcher function. FmhaFwdAppendKVApiPool.api fills
+# {F_dispatch} via str.format(); doubled braces become literal braces.
+FMHA_FWD_APPENDKV_API = """
+float fmha_fwd_appendkv(fmha_fwd_appendkv_traits t, fmha_fwd_appendkv_args a, const ck_tile::stream_config& s) {{
+    float r = -1;
+
+    [[maybe_unused]] const std::string device_name = ck_tile::get_device_name();
+
+{F_dispatch}
+    return r;
+}}
+"""
+
+# C++ template of one dispatch branch. FmhaFwdAppendKVApiPool.api renders it once
+# per registered trait with str.format(); {F_if} comes from if_(i_trait) and the
+# {F_*check} fields come from the trait's scheck/skcheck/dcheck/dvcheck properties.
+FMHA_FWD_APPENDKV_API_INNER_DISPATCH = """{F_if}((t.is_v_rowmajor == {F_vlayout}) &&
+        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck}) && (t.rope_type == {F_rope_check}) &&
+        ((a.block_table_ptr != nullptr) == {F_pagedkv})) {{
+    using trait_ = fmha_fwd_appendkv_traits_<{F_hdim}, {F_dtype}, {F_bs}, {F_bsk}, {F_bd}, {F_bdv}, {F_vlayout}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_rope}, {F_pagedkv}>;
+    return fmha_fwd_appendkv_<trait_, {F_arch.tag}>(s, a);
+}}
+"""
+
+
+@dataclass
+class FmhaFwdAppendKVApiTrait:
+    """Dispatch-side description of one appendkv kernel instance.
+
+    The ``*check`` properties return C++ boolean expressions as text; they are
+    spliced into the generated dispatcher by ``FmhaFwdAppendKVApiPool.api``.
+    """
+
+    arch: ArchTrait
+    # keep in sync with fmha_fwd_appendkv_traits, to generate fallback calls
+    hdim: str
+    dtype: str  # data type
+    bs: int  # tile size along q seqlen
+    bsk: int  # tile size along k seqlen
+    bd: int  # tile size along qk gemm unroll
+    bdv: int  # tile size along kv gemm unroll
+    vlayout: str
+    spad: str
+    skpad: str
+    dpad: str
+    dvpad: str
+    rope: str  # key from ROPE_MAP
+    pagedkv: str
+
+    @property
+    def name(self) -> str:
+        """Every field except ``arch``, in declaration order, joined with ``-``."""
+        ordered_fields = (
+            self.hdim,
+            self.dtype,
+            self.bs,
+            self.bsk,
+            self.bd,
+            self.bdv,
+            self.vlayout,
+            self.spad,
+            self.skpad,
+            self.dpad,
+            self.dvpad,
+            self.rope,
+            self.pagedkv,
+        )
+        return "-".join(f"{value}" for value in ordered_fields)
+
+    @property
+    def scheck(self) -> str:
+        """C++ condition on ``a.seqlen_q``; always true when ``spad`` is ``"t"``."""
+        if self.spad != "t":
+            return f"a.seqlen_q % {self.bs} == 0"
+        return f"true /*a.seqlen_q % {self.bs} != 0*/"
+
+    @property
+    def skcheck(self) -> str:
+        """C++ condition on the k sequence length; unconditionally true."""
+        # the values in a.seqlen_k_ptr are not inspected
+        return "true"
+
+    @property
+    def dcheck(self) -> str:
+        """C++ condition on ``a.hdim_q``; always true when ``dpad`` is ``"t"``."""
+        if self.dpad != "t":
+            return f"a.hdim_q % {self.bd} == 0"
+        # TODO: order of get_pipelines() matters! (ugly)
+        return f"true /*a.hdim_q % {self.bd} != 0*/"
+
+    @property
+    def dvcheck(self) -> str:
+        """C++ condition on ``a.hdim_v``; always true when ``dvpad`` is ``"t"``."""
+        if self.dvpad != "t":
+            return f"a.hdim_v % {self.bdv} == 0"
+        # TODO: order of get_pipelines() matters! (ugly)
+        return f"true /*a.hdim_v % {self.bdv} != 0*/"
+
+
+@dataclass
+class FmhaFwdAppendKVPipeline:
+    """Pipeline options of an appendkv kernel; ``name`` encodes them for file names."""
+
+    F_vlayout: str  # row/col
+    F_spad: str  # t/f
+    F_skpad: str  # t/f
+    F_dpad: str  # t/f
+    F_dvpad: str  # t/f
+    F_rope: str  # key from ROPE_MAP
+    F_pagedkv: str  # t/f
+
+    @property
+    def name(self) -> str:
+        """Return ``v<layout initial>`` followed by optional ``_``-separated parts.
+
+        Optional parts, in order: ``p`` plus the enabled padding letters
+        (``s``, ``sk``, ``d``, ``dv``), the rope key unless it is ``"no"``,
+        and ``pagedkv`` when paged-kv is enabled.
+        """
+        pad_switches = (
+            (self.F_spad, "s"),
+            (self.F_skpad, "sk"),
+            (self.F_dpad, "d"),
+            (self.F_dvpad, "dv"),
+        )
+        pad_letters = "".join(letters for flag, letters in pad_switches if flag == "t")
+
+        pieces = [f"v{self.F_vlayout[0]}"]
+        if pad_letters:
+            pieces.append(f"p{pad_letters}")
+        if self.F_rope != "no":
+            pieces.append(f"{self.F_rope}")
+        if self.F_pagedkv == "t":
+            pieces.append("pagedkv")
+        return "_".join(pieces)
+
+
+class FmhaFwdAppendKVApiPool:
+    """Collects appendkv API traits and renders the C++ dispatcher source.
+
+    ``self.pool`` is a nested ordered mapping keyed by arch, then dtype, then
+    hdim; each leaf is the list of traits registered for that combination.
+    """
+
+    def __init__(self, mask_impl):
+        self.pool = OrderedDict()
+        # Stored as given; no method of this class reads it.
+        self.mask_impl = mask_impl
+
+    def register_traits(self, trait: FmhaFwdAppendKVApiTrait) -> None:
+        """Check ``trait`` against its bucket, then append a shallow copy of it."""
+        by_dtype = self.pool.setdefault(trait.arch, OrderedDict())
+        by_hdim = by_dtype.setdefault(trait.dtype, OrderedDict())
+        bucket = by_hdim.setdefault(trait.hdim, [])
+        check_duplicates_and_paddings(bucket, trait)
+        bucket.append(copy.copy(trait))
+
+    @property
+    def api(self) -> str:
+        """Render the dispatcher source: kernel header plus the nested dispatch code.
+
+        Nesting follows ``self.pool``: arch, dtype, hdim, then one branch per
+        trait. ``if_(index)`` supplies the keyword that opens each branch.
+        """
+        arch_chunks = []
+        for arch_pos, (arch, dtype_pools) in enumerate(self.pool.items()):
+            dtype_chunks = []
+            for dtype_pos, (dtype, hdim_pools) in enumerate(dtype_pools.items()):
+                hdim_chunks = []
+                for hdim_pos, (hdim, traits) in enumerate(hdim_pools.items()):
+                    branch_chunks = [
+                        FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(
+                            F_if=if_(trait_pos),
+                            F_arch=arch,
+                            F_vlayout=LAYOUT_MAP[trait.vlayout],
+                            F_scheck=trait.scheck,
+                            F_skcheck=trait.skcheck,
+                            F_dcheck=trait.dcheck,
+                            F_dvcheck=trait.dvcheck,
+                            F_rope_check=ROPE_CHECK_MAP[trait.rope],
+                            F_pagedkv=BOOL_MAP[trait.pagedkv],
+                            F_spad=BOOL_MAP[trait.spad],
+                            F_skpad=BOOL_MAP[trait.skpad],
+                            F_dpad=BOOL_MAP[trait.dpad],
+                            F_dvpad=BOOL_MAP[trait.dvpad],
+                            F_rope=ROPE_MAP[trait.rope],
+                            F_bs=trait.bs,
+                            F_bsk=trait.bsk,
+                            F_bd=trait.bd,
+                            F_bdv=trait.bdv,
+                            F_hdim=hdim,
+                            F_dtype=FWD_DTYPE_MAP[dtype],
+                        )
+                        for trait_pos, trait in enumerate(traits)
+                    ]
+                    hdim_chunks.append(
+                        FMHA_FWD_API_PER_HDIM_CASE.format(
+                            F_if=if_(hdim_pos),
+                            F_hdim=hdim,
+                            F_hdim_v=hdim,
+                            F_inner_dispatch=indent("".join(branch_chunks)),
+                        )
+                    )
+                dtype_chunks.append(
+                    FMHA_FWD_API_PER_DTYPE.format(
+                        F_if=if_(dtype_pos),
+                        F_dtype=dtype,
+                        F_hdim_case=indent("".join(hdim_chunks)),
+                    )
+                )
+            arch_chunks.append(
+                FMHA_FWD_API_PER_ARCH.format(
+                    F_if=if_(arch_pos),
+                    F_arch=arch,
+                    F_dtype_case=indent("".join(dtype_chunks)),
+                )
+            )
+
+        dispatch = "".join(arch_chunks)
+        if not dispatch:
+            # nothing registered: reference the parameters so the generated
+            # function does not trigger unused-parameter warnings
+            dispatch = "(void)t; (void)s; (void)a;"
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_APPENDKV_API.format(
+            F_dispatch=indent(dispatch)
+        )
+
+
+@dataclass
+class FmhaFwdAppendKVTileSize:
+    """Tile sizes of an appendkv kernel; ``name`` encodes them for file names."""
+
+    F_bs: int  # tile size along q seqlen
+    F_bsk: int  # tile size along k seqlen
+    F_bd: int  # tile size along qk gemm unroll
+    F_bdv: int  # tile size along kv gemm unroll
+    F_occupancy: int  # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy
+
+    @property
+    def name(self) -> str:
+        """Return ``b<bs>x<bsk>x<bd>x<bdv>``, plus ``_o<occupancy>`` unless it is -1."""
+        dims = f"b{self.F_bs}x{self.F_bsk}x{self.F_bd}x{self.F_bdv}"
+        if self.F_occupancy == -1:
+            return dims
+        return f"{dims}_o{self.F_occupancy}"
+
+
+@dataclass
+class FmhaFwdAppendKVKernel:
+    """One appendkv kernel instance: renders its C++ source, names and API trait."""
+
+    F_arch: ArchTrait
+    F_idx: int  # this is not a tunable, but a counter to differentiate symbol
+    F_hdim: int  # hdim
+    F_dtype: str  # data type
+    F_tile: FmhaFwdAppendKVTileSize
+    F_pipeline: FmhaFwdAppendKVPipeline
+    mask_impl: str
+
+    @property
+    def template(self) -> str:
+        """C++ source of this instance: the shared header plus the rendered kernel body."""
+        tile = self.F_tile
+        pipe = self.F_pipeline
+        body = FMHA_FWD_APPENDKV_KERNEL_BODY.format(
+            F_idx=self.F_idx,
+            F_arch=self.F_arch,
+            F_hdim=self.F_hdim,
+            F_dtype=FWD_DTYPE_MAP[self.F_dtype],
+            F_bs=tile.F_bs,
+            F_bsk=tile.F_bsk,
+            F_bd=tile.F_bd,
+            F_bdv=tile.F_bdv,
+            F_vlayout=LAYOUT_MAP[pipe.F_vlayout],
+            F_spad=BOOL_MAP[pipe.F_spad],
+            F_skpad=BOOL_MAP[pipe.F_skpad],
+            F_dpad=BOOL_MAP[pipe.F_dpad],
+            F_dvpad=BOOL_MAP[pipe.F_dvpad],
+            F_rope=ROPE_MAP[pipe.F_rope],
+            F_pagedkv=BOOL_MAP[pipe.F_pagedkv],
+            F_occupancy=tile.F_occupancy,
+        )
+        return FMHA_FWD_KERNEL_HEADER + body
+
+    @property
+    def name(self) -> str:
+        """Kernel name built from hdim, dtype, the tile name and the pipeline name."""
+        # TODO: we don't encode idx here
+        prefix = f"fmha_fwd_appendkv_d{self.F_hdim}_{self.F_dtype}"
+        return "_".join((prefix, self.F_tile.name, self.F_pipeline.name))
+
+    @property
+    def filename(self) -> str:
+        """Source file name: kernel name, the arch's filename suffix, then ``.cpp``."""
+        suffix = self.F_arch.filename_suffix
+        return f"{self.name}{suffix}.cpp"
+
+    def api_trait(self) -> FmhaFwdAppendKVApiTrait:
+        """Build the dispatch-side trait that mirrors this kernel's parameters."""
+        tile = self.F_tile
+        pipe = self.F_pipeline
+        return FmhaFwdAppendKVApiTrait(
+            arch=self.F_arch,
+            hdim=str(self.F_hdim),
+            dtype=self.F_dtype,
+            bs=tile.F_bs,
+            bsk=tile.F_bsk,
+            bd=tile.F_bd,
+            bdv=tile.F_bdv,
+            vlayout=pipe.F_vlayout,
+            spad=pipe.F_spad,
+            skpad=pipe.F_skpad,
+            dpad=pipe.F_dpad,
+            dvpad=pipe.F_dvpad,
+            rope=pipe.F_rope,
+            pagedkv=pipe.F_pagedkv,
+        )
+
+
+class KernelComponentFactoryBase:
+    """Default tile-size and pipeline tables shared by the per-arch factories."""
+
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        """Return the tile sizes keyed by head-dim string for ``dtype``.
+
+        Returns ``None`` when no tile sizes are defined for ``dtype``.
+        """
+        if dtype in ["fp16", "bf16"]:
+            return {
+                "32": FmhaFwdAppendKVTileSize(64, 64, 32, 32, -1),
+                "64": FmhaFwdAppendKVTileSize(64, 64, 64, 64, -1),
+                "128": FmhaFwdAppendKVTileSize(64, 64, 128, 128, -1),
+                "256": FmhaFwdAppendKVTileSize(64, 64, 256, 256, -1),
+            }
+        elif dtype in ["fp8", "bf8"]:
+            return {
+                "64": FmhaFwdAppendKVTileSize(64, 64, 64, 64, -1),
+                "128": FmhaFwdAppendKVTileSize(64, 64, 128, 128, -1),
+                "256": FmhaFwdAppendKVTileSize(64, 64, 256, 256, -1),
+            }
+        else:
+            return None
+
+    @staticmethod
+    def get_pipelines(dtype, hdim) -> List[FmhaFwdAppendKVPipeline]:
+        """Return the candidate pipelines for ``dtype`` (``hdim`` is not consulted).
+
+        Raises:
+            AssertionError: if ``dtype`` is not one of the known data types.
+        """
+        # this function populates a list of possible pipelines
+        # TODO: the order of the list matters! entries later in this list are also checked later
+        # TODO: currently for qr pipeline, let 't' padding to appear later!!
+        # TODO: how to design this more generic?
+        pipelines = []
+        if dtype in ["fp16", "bf16"]:
+            # NOTICE: it will be very complicated if we consider all the hdim_q padding cases while
+            #         applying rotary embedding, so I just use 't' in inter/half pipelines
+            for vlayout, pagedkv in itertools.product(["row"], ["t", "f"]):
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "f", "t", "f", "f", "no", pagedkv))  # fmt: skip
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "t", "t", "t", "t", "no", pagedkv))  # fmt: skip
+
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "f", "t", "t", "f", "inter", pagedkv))  # fmt: skip
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "t", "t", "t", "t", "inter", pagedkv))  # fmt: skip
+
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "f", "t", "t", "f", "half", pagedkv))  # fmt: skip
+                pipelines.append(FmhaFwdAppendKVPipeline(vlayout, "t", "t", "t", "t", "half", pagedkv))  # fmt: skip
+        elif dtype in ["fp8", "bf8"]:
+            # rope/paged-kv is not supported
+            pipelines.append(FmhaFwdAppendKVPipeline("row", "t", "t", "t", "t", "no", "f"))  # fmt: skip
+        elif dtype in ["fp8fp16", "fp8bf16"]:
+            # TODO: no pipelines defined yet for these dtypes
+            pass
+        else:
+            # Raised explicitly (instead of `assert False`) so the check is not
+            # compiled out when Python runs with -O.
+            raise AssertionError(f"unsupported dtype: {dtype}")
+        return pipelines
+
+
+class KernelComponentFactoryGfx9(KernelComponentFactoryBase):
+    # Factory returned by get_factory() for targets starting with "gfx9";
+    # it uses the base tile-size and pipeline tables unchanged.
+    arch = ArchTrait("gfx9")
+
+
+class KernelComponentFactoryGfx11(KernelComponentFactoryBase):
+    """gfx11 factory: only fp16/bf16 are forwarded to the base tables."""
+
+    arch = ArchTrait("gfx11")
+
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        """Base tile sizes for fp16/bf16; ``None`` for every other dtype."""
+        if dtype not in ("fp16", "bf16"):
+            return None
+        return KernelComponentFactoryBase.get_hdim_tile_size_dict(dtype)
+
+    @staticmethod
+    def get_pipelines(dtype, hdim) -> List[FmhaFwdAppendKVPipeline]:
+        """Base pipelines for fp16/bf16; an empty list for every other dtype."""
+        if dtype not in ("fp16", "bf16"):
+            return []
+        return KernelComponentFactoryBase.get_pipelines(dtype, hdim)
+
+
+class KernelComponentFactoryGfx12(KernelComponentFactoryBase):
+    # Factory returned by get_factory() for targets starting with "gfx12";
+    # it uses the base tile-size and pipeline tables unchanged.
+    arch = ArchTrait("gfx12")
+
+
+def get_factory(target: str):
+    """Map a GPU target name to its kernel component factory class.
+
+    The first entry whose prefix matches ``target`` wins.
+
+    Raises:
+        Exception: if ``target`` matches none of the known prefixes.
+    """
+    # Place more specific architectures first
+    prefix_to_factory = (
+        ("gfx9", KernelComponentFactoryGfx9),
+        ("gfx11", KernelComponentFactoryGfx11),
+        ("gfx12", KernelComponentFactoryGfx12),
+    )
+    for prefix, factory in prefix_to_factory:
+        if target.startswith(prefix):
+            return factory
+
+    raise Exception(f"Unsupported device target {target}")
+
+
+def get_fwd_appendkv_blobs(
+    targets: List[str], kernel_filter: Optional[str], receipt, mask_impl, optdim_list
+) -> Tuple[FmhaFwdAppendKVApiPool, List[FmhaFwdAppendKVKernel]]:
+    """Enumerate the appendkv kernels to generate and the API pool dispatching to them.
+
+    Args:
+        targets: GPU target names; resolved to factories via ``get_factory``.
+        kernel_filter: ``fnmatch`` pattern applied to kernel names; ``None`` or
+            ``""`` disables filtering.
+        receipt: integration selector; 2 and 4 keep only fp16/bf16 row-major
+            kernels, 800 and 801 keep only fp32 kernels.
+        mask_impl: forwarded to the API pool and to every kernel.
+        optdim_list: head dims to keep; ``[-1]`` keeps all of them.
+
+    Returns:
+        ``(api_pool, kernels)`` where every kernel in ``kernels`` has its trait
+        registered in ``api_pool``.
+    """
+    gen = list()
+    api_pool = FmhaFwdAppendKVApiPool(mask_impl)
+
+    factories = get_factories_for_targets(targets, get_factory)
+
+    for factory, dtype in itertools.product(factories, FWD_DTYPE_MAP.keys()):
+        d = factory.get_hdim_tile_size_dict(dtype)
+        if d is None:
+            continue
+        for hdim_str, tile in d.items():
+            hdim = int(hdim_str)
+            # The head-dim restriction does not depend on the pipeline, so it is
+            # applied once per hdim instead of once per pipeline.
+            if optdim_list != [-1] and hdim not in optdim_list:
+                continue
+            for pipeline in factory.get_pipelines(dtype, hdim):
+                k = FmhaFwdAppendKVKernel(
+                    F_arch=factory.arch,
+                    F_idx=0,
+                    F_hdim=hdim,
+                    F_dtype=dtype,
+                    F_tile=tile,
+                    F_pipeline=pipeline,
+                    mask_impl=mask_impl,
+                )
+                # Truthiness check: both None and "" mean "no filter". Passing
+                # None on to fnmatch would raise TypeError.
+                if kernel_filter and not fnmatch.fnmatch(k.name, kernel_filter):
+                    continue
+                # 2 - Flash attention integration, 4 - PyTorch integration:
+                # both keep only fp16/bf16 kernels with row-major V.
+                if receipt in (2, 4):
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= pipeline.F_vlayout == "row"
+                    if not cond:
+                        continue
+
+                # fp32 only
+                if receipt in (800, 801) and dtype != "fp32":
+                    continue
+
+                api_pool.register_traits(k.api_trait())
+                gen.append(k)
+
+    return (api_pool, gen)
+
+
+def write_single_kernel(kernel: FmhaFwdAppendKVKernel, autogen_dir: Path) -> None:
+    """Emit the generated source of one appendkv kernel instance.
+
+    ``kernel.template`` is passed to ``update_file`` together with the
+    destination path ``autogen_dir / kernel.filename``.
+    """
+    destination = autogen_dir / kernel.filename
+    update_file(destination, kernel.template)
+
+
+def write_fwd_appendkv_api(api_pool: FmhaFwdAppendKVApiPool, autogen_dir: Path) -> None:
+    """Emit the appendkv dispatcher source rendered by ``api_pool.api``.
+
+    The destination path is ``autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME``.
+    """
+    destination = autogen_dir / FMHA_FWD_APPENDKV_API_FILENAME
+    update_file(destination, api_pool.api)
+
+
+def write_blobs(
+    targets: List[str],
+    output_dir: Path,
+    kernel_filter: Optional[str],
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    """Generate every selected appendkv kernel source plus the API source.
+
+    All selection arguments are forwarded to ``get_fwd_appendkv_blobs``; the
+    resulting files are written into ``output_dir``.
+    """
+    pool, generated = get_fwd_appendkv_blobs(
+        targets, kernel_filter, receipt, mask_impl, optdim_list
+    )
+    # Kernel instances are written first, the dispatcher last.
+    for blob in generated:
+        write_single_kernel(blob, output_dir)
+    write_fwd_appendkv_api(pool, output_dir)
+
+
+def list_blobs(
+    targets: List[str],
+    file_path: Path,
+    kernel_filter: Optional[str],
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    """Append the paths of all appendkv files that would be generated to ``file_path``.
+
+    One POSIX-style path is written per line: every selected kernel source
+    followed by the API source, all located under ``file_path.parent / GEN_DIR``.
+    """
+    # Append mode: existing content of the listing file is kept.
+    with file_path.open("a") as listing:
+        _, generated = get_fwd_appendkv_blobs(
+            targets, kernel_filter, receipt, mask_impl, optdim_list
+        )
+        gen_root = file_path.parent / GEN_DIR
+        for blob in generated:
+            listing.write(f"{(gen_root / blob.filename).as_posix()}\n")
+        listing.write(f"{(gen_root / FMHA_FWD_APPENDKV_API_FILENAME).as_posix()}\n")
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_pagedkv_prefill.py
@@ -0,0 +1,799 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+import copy
+import fnmatch
+import itertools
+from collections import OrderedDict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple
+
+from codegen.arch import ArchTrait, get_factories_for_targets
+from codegen.cmake_config import GEN_DIR
+from codegen.cpp_symbol_map import (
+    LAYOUT_MAP,
+    BIAS_CHECK_MAP,
+    get_mask_check_map,
+    MODE_MAP,
+    get_mask_map,
+    BIAS_MAP,
+    FWD_DTYPE_MAP,
+    BOOL_MAP,
+    PIPELINE_ENUM_MAP,
+)
+from codegen.utils import check_duplicates_and_paddings, if_, indent, update_file
+
+from codegen.ops.fmha_fwd import (
+    DTYPE_BITS,
+    K0_MAX_SUBMAX_MAP,
+    FMHA_FWD_KERNEL_HEADER,
+    FMHA_FWD_API_PER_ARCH,
+    FMHA_FWD_API_PER_DTYPE,
+    FMHA_FWD_API_PER_HDIM_CASE,
+)
+
+
+# Maps a pipeline tag to the C++ pipeline class template name.
+# NOTE(review): presumably used to fill {F_pipeline} in FMHA_FWD_KERNEL_BODY — confirm at the use site.
+FMHA_FWD_PAGEDKV_PIPELINE_MAP = {
+    "qr_pagedkv": "ck_tile::BlockFmhaFwdPagedKVPipelineQRKSVS"
+}
+
+# C++ source template for a single paged-kv forward kernel instance.
+# "{F_*}" fields are str.format()-style placeholders and doubled braces
+# ("{{", "}}") stand for literal braces in the generated file.
+FMHA_FWD_KERNEL_BODY = """
+#include <iostream>
+
+#if !defined(__HIP_DEVICE_COMPILE__) || ({F_arch.preprocessor_check})
+
+using fmha_dtype_{F_idx} = {F_dtype};
+
+using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}>;
+
+using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
+                                      ck_tile::sequence<{F_rm0}, {F_rn0}, {F_rk0}>,
+                                      ck_tile::sequence<{F_wm0}, {F_wn0}, {F_wk0}>,
+                                      ck_tile::sequence<{F_rm1}, {F_rn1}, {F_rk1}>,
+                                      ck_tile::sequence<{F_wm1}, {F_wn1}, {F_wk1}>,
+                                      {F_vlayout}>;
+
+using fmha_trait_{F_idx} = ck_tile::TileFmhaFwdPagedKVTraits<{F_spad},
+                                                             {F_skpad},
+                                                             {F_dpad},
+                                                             {F_dvpad},
+                                                             {F_logits},
+                                                             {F_bias},
+                                                             false,
+                                                             {F_lse},      //lse
+                                                             {F_pagedkv},  //pagedkv
+                                                             {F_squant},
+                                                             {F_occupancy},
+                                                             {F_skip},
+                                                             {F_sink}>;
+
+using fmha_variant_{F_idx} = ck_tile::ComposedAttention<{F_logits} * ck_tile::LOGITS_SOFT_CAP, CK_TILE_FMHA_FWD_FAST_EXP2>;
+
+using fmha_mask_{F_idx} = {F_mask};
+
+using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaFwdPagedKVPipelineProblem<
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
+    typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
+    fmha_shape_{F_idx},
+    {F_mode},
+    fmha_variant_{F_idx},
+    fmha_mask_{F_idx},
+    fmha_trait_{F_idx}>;
+
+using fmha_pipeline_{F_idx} = {F_pipeline}<
+    fmha_pipeline_problem_{F_idx}>;
+
+using fmha_epilogue_{F_idx} =
+    ck_tile::Default2DEpilogue<ck_tile::Default2DEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
+                                           typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
+                                           {F_spad}, {F_dvpad}>>;
+
+using fmha_kernel_{F_idx} =
+    ck_tile::FmhaFwdPagedKVKernel<fmha_pipeline_{F_idx}, fmha_epilogue_{F_idx}>;
+
+using trait_{F_idx} = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout},
+                        {F_pipeline_enum}, {F_logits}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip}, {F_sink}>;
+
+template<>
+float fmha_fwd_pagedkv_<trait_{F_idx}, {F_arch.tag}>(const ck_tile::stream_config& s, fmha_fwd_pagedkv_args a)
+{{
+    using k_ = fmha_kernel_{F_idx};
+    if(s.log_level_ > 0)
+        std::cout << ", " << k_::GetName() << std::flush;
+    auto [kargs, grids] = fmha_fwd_pagedkv_create_kargs_and_grids<k_>(a);
+    const dim3 blocks                      = k_::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<kBlockPerCu, {F_arch.tag}>(k_{{}}, grids, blocks, 0, kargs));
+}}
+
+#endif // !defined(__HIP_DEVICE_COMPILE__) || ({F_arch.preprocessor_check})
+"""
+
+# File name of the generated paged-kv dispatcher source.
+FMHA_FWD_API_FILENAME = "fmha_fwd_pagedkv_api.cpp"
+# C++ template of the dispatcher function; {F_dispatch} is a str.format()-style
+# placeholder for the nested dispatch code and doubled braces are literal braces.
+FMHA_FWD_API = """
+float fmha_fwd_pagedkv(fmha_fwd_pagedkv_traits& t, fmha_fwd_pagedkv_args& a, const ck_tile::stream_config& s) {{
+    float r = -1;
+
+    [[maybe_unused]] const std::string device_name = ck_tile::get_device_name();
+
+{F_dispatch}
+    return r;
+}}
+"""
+
+# C++ template of one dispatch branch. FmhaFwdApiPool.api renders it once per
+# registered trait with str.format(); {F_if} comes from if_(i_trait) and the
+# {F_*check} fields come from the trait's scheck/skcheck/dcheck/dvcheck properties.
+FMHA_FWD_API_INNER_DISPATCH = """{F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && (t.has_logits_soft_cap == {F_logits}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.has_lse == {F_lse})  && (t.use_pagedkv == {F_pagedkv}) && (t.do_fp8_static_quant == {F_squant}) && (t.skip_min_seqlen_q == {F_skip}) && (t.has_sink == {F_sink}) &&
+        ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
+    using trait_ = fmha_fwd_pagedkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_logits}, {F_mask}, {F_bias}, {F_lse}, {F_pagedkv}, {F_squant}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}, {F_skip},{F_sink}>;
+    return fmha_fwd_pagedkv_<trait_, {F_arch.tag}>(s, a);
+}}
+"""
+
+
+@dataclass
+class FmhaFwdApiTrait:
+    """Dispatch-side description of one paged-kv forward kernel instance.
+
+    The ``*check`` properties return C++ boolean expressions as text; they are
+    spliced into the generated dispatcher by ``FmhaFwdApiPool.api``. Each of
+    them raises ``AssertionError`` for a pipeline tag it does not know.
+    """
+
+    arch: ArchTrait
+    pipeline_tag: str
+    # sync with fmha_fwd_traits<>, to generate fallback calls
+    hdim: str
+    dtype: str  # data type
+    mode: str  # value from MODE_MAP
+    bm0: int  # tile size along q seqlen (block size)
+    bn0: int  # tile size along qk seqlen
+    bk0: int  # tile size along qk gemm unroll
+    bn1: int  # tile size along v head_dim
+    bk1: int  # tile size along kv gemm unroll
+    bk0max: int
+    vlayout: str
+    logits: str
+    mask: str
+    bias: str
+    lse: str
+    pagedkv: str
+    squant: str
+    spad: str
+    skpad: str
+    dpad: str
+    dvpad: str
+    skip: str
+    sink: str
+
+    @property
+    def name(self) -> str:
+        """Fields from ``hdim`` through ``sink``, in declaration order, joined with ``-``."""
+        # The seventh component is bn1. It used to repeat bn0, which gave
+        # traits differing only in bn1 the same name.
+        return (
+            f"{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn1}-{self.bk1}-{self.bk0max}-"
+            + f"{self.vlayout}-{self.logits}-{self.mask}-{self.bias}-{self.lse}-{self.pagedkv}-{self.squant}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}-{self.skip}-{self.sink}"
+        )
+
+    @property
+    def scheck(self) -> str:
+        """C++ condition on ``a.seqlen_q`` under which this instance may be selected."""
+        if self.mode == "group":
+            return "true/*group mode spad always true*/"  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == "qr_async":
+            # always supported, with or without padding
+            return "true"
+        elif self.pipeline_tag in ["qr_pagedkv", "qs"]:
+            if self.spad == "t":
+                return f"true /*a.seqlen_q % {self.bm0} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.seqlen_q % {self.bm0} == 0"
+        else:
+            # Raised explicitly so the check survives `python -O`.
+            raise AssertionError(f"unsupported pipeline tag: {self.pipeline_tag}")
+
+    @property
+    def skcheck(self) -> str:
+        """C++ condition on ``a.seqlen_k`` under which this instance may be selected."""
+        if self.mode == "group":
+            return "true/*group mode skpad always true*/"  # group mode only generate spad/skpad == true
+        if self.pipeline_tag == "qr_async":
+            if self.skpad == "t":
+                return f"a.seqlen_k == 0 || a.seqlen_k % {self.bn0} != 0"
+            else:
+                return f"a.seqlen_k != 0 && a.seqlen_k % {self.bn0} == 0"
+        elif self.pipeline_tag in ["qr_pagedkv", "qs"]:
+            if self.skpad == "t":
+                return f"true /*a.seqlen_k_ptr != nullptr || a.seqlen_k % {self.bn0} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.seqlen_k_ptr == nullptr && a.seqlen_k % {self.bn0} == 0"
+        else:
+            raise AssertionError(f"unsupported pipeline tag: {self.pipeline_tag}")
+
+    @property
+    def dcheck(self) -> str:
+        """C++ condition on ``a.hdim_q`` under which this instance may be selected."""
+        if self.pipeline_tag == "qr_async":
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dpad == "t":
+                return f"a.hdim_q % {vec} == 0"
+            else:
+                raise AssertionError("qr_async requires dpad == 't'")
+        elif self.pipeline_tag in ["qr_pagedkv", "qs"]:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dpad == "t":
+                return f"true /*a.hdim_q % {bk0submax} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.hdim_q % {bk0submax} == 0"
+        else:
+            raise AssertionError(f"unsupported pipeline tag: {self.pipeline_tag}")
+
+    @property
+    def dvcheck(self) -> str:
+        """C++ condition on ``a.hdim_v`` under which this instance may be selected."""
+        if self.pipeline_tag == "qr_async":
+            vec = int((32 * 4) / DTYPE_BITS[self.dtype])
+            if self.dvpad == "t":
+                return f"a.hdim_v % {vec} == 0"
+            else:
+                raise AssertionError("qr_async requires dvpad == 't'")
+        elif self.pipeline_tag in ["qr_pagedkv", "qs"]:
+            bk0submax = K0_MAX_SUBMAX_MAP[self.bk0max]
+            if self.dvpad == "t":
+                return f"true /*a.hdim_v % {bk0submax} != 0*/"  # TODO: order of get_pipelines() matters! (ugly)
+            else:
+                return f"a.hdim_v % {bk0submax} == 0"
+        else:
+            raise AssertionError(f"unsupported pipeline tag: {self.pipeline_tag}")
+
+
+@dataclass
+class FmhaFwdPipeline:
+    """Pipeline options of a paged-kv forward kernel; ``name`` encodes them for file names."""
+
+    tag: str
+
+    F_vlayout: str  # row/col
+    F_spad: str  # t/f
+    F_skpad: str  # t/f
+    F_dpad: str  # t/f
+    F_dvpad: str  # t/f
+    F_logits: str  # t/f
+    F_bias: str  # "no" or a bias key
+    F_lse: str  # t/f
+    F_pagedkv: str  # t/f
+    F_squant: str  # t/f
+    F_mask: str  # value from MASK_MAP
+    F_skip: str  # t/f
+    F_sink: str  # t/f
+
+    @property
+    def name(self) -> str:
+        """Return the ``_``-joined encoding of every option.
+
+        Parts, in order: ``<tag>_v<layout initial>``, padding, logits, bias,
+        mask, lse, skip, squant, pagedkv, sink. A disabled option is spelled
+        with an ``n`` prefix (``npad``, ``nlogits``, ...).
+        """
+
+        def on_off(flag: str, label: str) -> str:
+            # "label" when the flag is "t", "nlabel" otherwise
+            return label if flag == "t" else f"n{label}"
+
+        pad_switches = (
+            (self.F_spad, "s"),
+            (self.F_skpad, "sk"),
+            (self.F_dpad, "d"),
+            (self.F_dvpad, "dv"),
+        )
+        pad_letters = "".join(letters for flag, letters in pad_switches if flag == "t")
+
+        # "s_"-prefixed masks: only "s_mask" counts as masked; other masks are
+        # encoded by their first character, and "no" means unmasked.
+        if self.F_mask == "s_mask":
+            mask_part = "mask"
+        elif self.F_mask[0:2] == "s_" or self.F_mask == "no":
+            mask_part = "nmask"
+        else:
+            mask_part = f"m{self.F_mask[0]}"
+
+        parts = [
+            f"{self.tag}_v{self.F_vlayout[0]}",
+            f"p{pad_letters}" if pad_letters else "npad",
+            on_off(self.F_logits, "logits"),
+            f"{self.F_bias}" if self.F_bias != "no" else "nbias",
+            mask_part,
+            on_off(self.F_lse, "lse"),
+            on_off(self.F_skip, "skip"),
+            on_off(self.F_squant, "squant"),
+            on_off(self.F_pagedkv, "pagedkv"),
+            on_off(self.F_sink, "sink"),
+        ]
+        return "_".join(parts)
+
+
+class FmhaFwdApiPool:  # collects api traits and renders the dispatch source from them
+    def __init__(self, mask_impl):
+        self.pool = OrderedDict()  # arch -> dtype -> hdim -> list of traits, insertion-ordered
+        self.mask_impl = mask_impl
+
+    def register_traits(self, trait: FmhaFwdApiTrait) -> None:
+        hdim = trait.hdim
+        ts = (
+            self.pool.setdefault(trait.arch, OrderedDict())
+            .setdefault(trait.dtype, OrderedDict())
+            .setdefault(hdim, [])
+        )
+        check_duplicates_and_paddings(ts, trait)  # raises on duplicates / badly ordered paddings
+        ts.append(copy.copy(trait))  # keep a shallow copy of the trait
+
+    @property
+    def api(self) -> str:  # full text of the API file: one if / else-if chain per nesting level
+        per_arch = str()
+        for i_arch, (arch, pool_by_arch) in enumerate(self.pool.items()):
+            per_dtypes = str()
+            for i_dtype, (dtype, pool_by_dtype) in enumerate(pool_by_arch.items()):
+                per_hdim_case = str()
+                for i_hdim, (hdim, pool_by_hdim) in enumerate(pool_by_dtype.items()):
+                    inners = str()
+                    for i_trait, trait in enumerate(pool_by_hdim):
+                        inners += FMHA_FWD_API_INNER_DISPATCH.format(
+                            F_if=if_(i_trait),
+                            F_arch=arch,
+                            F_mode=MODE_MAP[trait.mode],
+                            F_vlayout=LAYOUT_MAP[trait.vlayout],
+                            F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag],
+                            F_logits=BOOL_MAP[trait.logits],
+                            F_mask=get_mask_map(self.mask_impl)[trait.mask],
+                            F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask],
+                            F_bias_check=BIAS_CHECK_MAP[trait.bias],
+                            F_bias=BIAS_MAP[trait.bias],
+                            F_lse=BOOL_MAP[trait.lse],
+                            F_pagedkv=BOOL_MAP[trait.pagedkv],
+                            F_skip=BOOL_MAP[trait.skip],
+                            F_sink=BOOL_MAP[trait.sink],
+                            F_squant=BOOL_MAP[trait.squant],
+                            F_scheck=trait.scheck,
+                            F_skcheck=trait.skcheck,
+                            F_dcheck=trait.dcheck,
+                            F_dvcheck=trait.dvcheck,
+                            F_spad=BOOL_MAP[trait.spad],
+                            F_skpad=BOOL_MAP[trait.skpad],
+                            F_dpad=BOOL_MAP[trait.dpad],
+                            F_dvpad=BOOL_MAP[trait.dvpad],
+                            F_bm0=trait.bm0,
+                            F_bn0=trait.bn0,
+                            F_bk0=trait.bk0,
+                            F_bn1=trait.bn1,
+                            F_bk1=trait.bk1,
+                            F_bk0max=trait.bk0max,
+                            F_hdim=hdim,
+                            F_dtype=FWD_DTYPE_MAP[dtype],
+                        )
+                    per_hdim_case += FMHA_FWD_API_PER_HDIM_CASE.format(
+                        F_if=if_(i_hdim),
+                        F_hdim=hdim,
+                        F_hdim_v=trait.bn1,  # trait is the last one iterated for this hdim
+                        F_inner_dispatch=indent(inners),
+                    )
+                per_dtypes += FMHA_FWD_API_PER_DTYPE.format(
+                    F_if=if_(i_dtype), F_dtype=dtype, F_hdim_case=indent(per_hdim_case)
+                )
+            per_arch += FMHA_FWD_API_PER_ARCH.format(
+                F_if=if_(i_arch),
+                F_arch=arch,
+                F_dtype_case=indent(per_dtypes),
+            )
+        if not per_arch:
+            # per_arch is empty: emit no-op (void) statements to suppress warnings in the api
+            per_arch = "(void)t; (void)s; (void)a;"
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch=indent(per_arch))
+
+
+@dataclass
+class FmhaFwdTileSize:
+    F_bm0: int  # tile size along q seqlen (block size)
+    F_bn0: int  # tile size along k seqlen
+    F_bk0: int  # tile size along qk gemm unroll
+    F_bn1: int  # tile size along v head_dim
+    F_bk1: int  # tile size along kv gemm unroll
+    F_bk0max: int  # total length of K0, used for pipelines that need to load Q at once (or repeatedly load Q as a whole tile)
+    F_rm0: int  # number of warps for gemm0 along q seqlen
+    F_rn0: int  # number of warps for gemm0 along k seqlen
+    F_rk0: int  # number of warps for gemm0 along head dim q (not used)
+    F_rm1: int  # number of warps for gemm1 along q seqlen
+    F_rn1: int  # number of warps for gemm1 along head dim v
+    F_rk1: int  # number of warps for gemm1 along k seqlen (not used)
+    F_wm0: int  # gemm0 warp size along m
+    F_wn0: int  # gemm0 warp size along n
+    F_wk0: int  # gemm0 warp size along k
+    F_wm1: int  # gemm1 warp size along m
+    F_wn1: int  # gemm1 warp size along n
+    F_wk1: int  # gemm1 warp size along k
+    F_occupancy: int  # occupancy; -1 lets the pipeline decide, any other value overrides it
+
+    @property
+    def name(self) -> str:  # tile part of the kernel name; "_o<N>" only if occupancy != -1
+        return (
+            f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0max}"
+            + f"_r{self.F_rm0}x{self.F_rn0}x{self.F_rk0}_r{self.F_rm1}x{self.F_rn1}x{self.F_rk1}"
+            + f"_w{self.F_wm0}x{self.F_wn0}x{self.F_wk0}_w{self.F_wm1}x{self.F_wn1}x{self.F_wk1}"
+            + ("" if self.F_occupancy == -1 else f"_o{self.F_occupancy}")
+        )
+
+
+@dataclass
+class FmhaFwdKernel:
+    F_arch: ArchTrait
+    F_idx: int  # this is not a tunable, but a counter to differentiate symbol
+    F_hdim: int  # hdim
+    F_dtype: str  # data type
+    F_mode: str  # value from MODE_MAP
+    F_tile: FmhaFwdTileSize
+    F_pipeline: FmhaFwdPipeline
+    mask_impl: str
+
+    @property
+    def template(self) -> str:  # kernel source text: common header + formatted kernel body
+        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_KERNEL_BODY.format(
+            F_idx=self.F_idx,
+            F_arch=self.F_arch,
+            F_hdim=self.F_hdim,
+            F_dtype=FWD_DTYPE_MAP[self.F_dtype],
+            F_bm0=self.F_tile.F_bm0,
+            F_bn0=self.F_tile.F_bn0,
+            F_bk0=self.F_tile.F_bk0,
+            F_bn1=self.F_tile.F_bn1,
+            F_bk1=self.F_tile.F_bk1,
+            F_bk0max=self.F_tile.F_bk0max,
+            F_rm0=self.F_tile.F_rm0,
+            F_rn0=self.F_tile.F_rn0,
+            F_rk0=self.F_tile.F_rk0,
+            F_rm1=self.F_tile.F_rm1,
+            F_rn1=self.F_tile.F_rn1,
+            F_rk1=self.F_tile.F_rk1,
+            F_wm0=self.F_tile.F_wm0,
+            F_wn0=self.F_tile.F_wn0,
+            F_wk0=self.F_tile.F_wk0,
+            F_wm1=self.F_tile.F_wm1,
+            F_wn1=self.F_tile.F_wn1,
+            F_wk1=self.F_tile.F_wk1,
+            F_vlayout=LAYOUT_MAP[self.F_pipeline.F_vlayout],
+            F_spad=BOOL_MAP[self.F_pipeline.F_spad],
+            F_skpad=BOOL_MAP[self.F_pipeline.F_skpad],
+            F_dpad=BOOL_MAP[self.F_pipeline.F_dpad],
+            F_dvpad=BOOL_MAP[self.F_pipeline.F_dvpad],
+            F_logits=BOOL_MAP[self.F_pipeline.F_logits],
+            F_bias=BIAS_MAP[self.F_pipeline.F_bias],
+            F_lse=BOOL_MAP[self.F_pipeline.F_lse],
+            F_pagedkv=BOOL_MAP[self.F_pipeline.F_pagedkv],
+            F_squant=BOOL_MAP[self.F_pipeline.F_squant],
+            F_skip=BOOL_MAP[self.F_pipeline.F_skip],
+            F_sink=BOOL_MAP[self.F_pipeline.F_sink],
+            F_occupancy=self.F_tile.F_occupancy,
+            F_pipeline_enum=PIPELINE_ENUM_MAP[self.F_pipeline.tag],
+            F_mask=get_mask_map(self.mask_impl)[self.F_pipeline.F_mask],
+            F_mode=MODE_MAP[self.F_mode],
+            F_pipeline=FMHA_FWD_PAGEDKV_PIPELINE_MAP[self.F_pipeline.tag],
+        )
+
+    @property
+    def name(self) -> str:
+        # TODO: F_idx is not encoded in the name (F_arch and mask_impl are not either)
+        return (
+            f"fmha_fwd_pagedkv_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_"
+            + self.F_tile.name
+            + "_"
+            + self.F_pipeline.name
+        )
+
+    @property
+    def filename(self) -> str:  # kernel name + arch-specific suffix + ".cpp"
+        return f"{self.name}{self.F_arch.filename_suffix}.cpp"
+
+    def api_trait(self) -> FmhaFwdApiTrait:  # flattens this kernel into the trait used for dispatch
+        return FmhaFwdApiTrait(
+            arch=self.F_arch,
+            pipeline_tag=self.F_pipeline.tag,
+            hdim=str(self.F_hdim),  # the trait carries hdim as a string
+            dtype=self.F_dtype,
+            mode=self.F_mode,
+            bm0=self.F_tile.F_bm0,
+            bn0=self.F_tile.F_bn0,
+            bk0=self.F_tile.F_bk0,
+            bn1=self.F_tile.F_bn1,
+            bk1=self.F_tile.F_bk1,
+            bk0max=self.F_tile.F_bk0max,
+            vlayout=self.F_pipeline.F_vlayout,
+            mask=self.F_pipeline.F_mask,
+            logits=self.F_pipeline.F_logits,
+            bias=self.F_pipeline.F_bias,
+            lse=self.F_pipeline.F_lse,
+            pagedkv=self.F_pipeline.F_pagedkv,
+            squant=self.F_pipeline.F_squant,
+            spad=self.F_pipeline.F_spad,
+            skpad=self.F_pipeline.F_skpad,
+            dpad=self.F_pipeline.F_dpad,
+            dvpad=self.F_pipeline.F_dvpad,
+            skip=self.F_pipeline.F_skip,
+            sink=self.F_pipeline.F_sink,
+        )
+
+
+class KernelComponentFactoryBase:
+    @staticmethod
+    def get_pipelines(dtype, hdim, mask_impl) -> List[FmhaFwdPipeline]:
+        # Build the list of candidate qr_pagedkv pipelines for dtype (hdim is currently unused).
+        # TODO: the order of the list matters! entries later in this list are also checked later
+        # TODO: currently for qr_pagedkv pipeline, let "t" padding to appear later!!
+        # TODO: how to design this more generic?
+        squant = "t" if dtype == "fp8" else "f"
+        pipelines = []
+        if dtype in ["fp16", "bf16"]:
+            for logits, mask, bias, pagedkv, skip, sink in itertools.product(
+                ["t", "f"],
+                get_mask_map(mask_impl).keys(),
+                BIAS_MAP.keys(),
+                ["t"],
+                ["f"],
+                ["t", "f"],
+            ):
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "f", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip, sink))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", pagedkv, squant, mask, skip, sink))  # fmt: skip
+        elif dtype in ["fp8", "bf8"]:
+            # no lse/dropout kernels are needed
+            for logits, mask, bias in itertools.product(
+                ["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys()
+            ):
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "f", "f", "f", "f", logits, bias, "f", "t", squant, mask, "f", "f"))  # fmt: skip
+                pipelines.append(FmhaFwdPipeline("qr_pagedkv", "row", "t", "t", "f", "f", logits, bias, "f", "t", squant, mask, "f", "f"))  # fmt: skip
+        elif dtype in ["fp8fp16", "fp8bf16"]:
+            pass  # TODO
+        else:
+            raise AssertionError(f"unsupported dtype: {dtype}")  # unlike `assert`, survives -O
+        return pipelines
+
+
+class KernelComponentFactoryGfx9(KernelComponentFactoryBase):
+    arch = ArchTrait("gfx9")
+    # Maps hdim (as a string) to its tile size; returns None for unsupported dtypes.
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        if dtype in ["fp16", "bf16"]:
+            return {
+                # "32":  FmhaFwdTileSize(128,  64, 16,  32, 32,  32,  2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                # "64":  FmhaFwdTileSize(128,  64, 32,  64, 32,  64,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                # "96":  FmhaFwdTileSize(128, 128, 32, 128, 32,  96,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                "128": FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                # "192": FmhaFwdTileSize(128, 128, 32, 128, 32, 192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+                # "256": FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            }  # fmt: skip
+        elif dtype in ["fp8", "bf8"]:
+            return {
+                "64":  FmhaFwdTileSize(128,  64, 32, 64,  32,  64,  2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+                "128": FmhaFwdTileSize(128, 128, 32, 128, 32, 128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+                "256": FmhaFwdTileSize(128, 128, 32, 256, 32, 256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            }  # fmt: skip
+        else:
+            return None
+
+
+class KernelComponentFactoryGfx11(KernelComponentFactoryBase):
+    arch = ArchTrait("gfx11")
+    # Maps hdim (as a string) to its tile size; returns None for unsupported dtypes.
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        if dtype in ["fp16", "bf16"]:
+            return {
+                #                      bm0, bn0, bk0, bn1, bk1, bk0max, then rm0..rk1, wm0..wk1, occupancy
+              # "32":  FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "64":  FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                "128": FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "192": FmhaFwdTileSize( 64,  64,  32, 128,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "256": FmhaFwdTileSize( 64,  64,  32, 256,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            }  # fmt: skip
+        else:
+            return None
+
+
+class KernelComponentFactoryGfx12(KernelComponentFactoryBase):
+    arch = ArchTrait("gfx12")
+    # Maps hdim (as a string) to its tile size; returns None for unsupported dtypes.
+    @staticmethod
+    def get_hdim_tile_size_dict(dtype: str) -> Optional[dict]:
+        if dtype in ["fp16", "bf16"]:
+            return {
+                #                      bm0, bn0, bk0, bn1, bk1, bk0max, then rm0..rk1, wm0..wk1, occupancy
+              # "32":  FmhaFwdTileSize( 64,  64,  16,  32,  32,   32,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "64":  FmhaFwdTileSize( 64,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                "128": FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "192": FmhaFwdTileSize( 64,  64,  32, 128,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+              # "256": FmhaFwdTileSize( 64,  64,  32, 256,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            }  # fmt: skip
+        elif dtype in ["fp8", "bf8"]:
+            return {
+                #                      bm0, bn0, bk0, bn1, bk1, bk0max, then rm0..rk1, wm0..wk1, occupancy
+                "64":  FmhaFwdTileSize(128,  64,  32,  64,  32,   64,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                "128": FmhaFwdTileSize( 64,  64,  32, 128,  32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+                "256": FmhaFwdTileSize( 64,  32,  32, 256,  32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+            }  # fmt: skip
+        else:
+            return None
+
+
+def get_factory(target: str):
+    """Return the kernel component factory class whose arch prefix matches target."""
+    # Prefixes are tried in order, so place more specific architectures first.
+    factories_by_prefix = (
+        ("gfx9", KernelComponentFactoryGfx9),
+        ("gfx11", KernelComponentFactoryGfx11),
+        ("gfx12", KernelComponentFactoryGfx12),
+    )
+    for prefix, factory in factories_by_prefix:
+        if target.startswith(prefix):
+            return factory
+    raise Exception(f"Unsupported device target {target}")
+
+
+def get_fwd_blobs(
+    targets: List[str], kernel_filter: Optional[str], receipt, optdim_list, mask_impl
+) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
+    """Build every kernel that passes all filters; return (api pool, kernel list)."""
+    gen = list()
+    api_pool = FmhaFwdApiPool(mask_impl)
+
+    factories = get_factories_for_targets(targets, get_factory)
+
+    for factory, dtype in itertools.product(factories, FWD_DTYPE_MAP.keys()):
+        d = factory.get_hdim_tile_size_dict(dtype)
+        if d is None:
+            continue
+        for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()):
+            tile = d[hdim_str]
+            hdim = int(hdim_str)
+            for pipeline in factory.get_pipelines(dtype, hdim, mask_impl):
+                if mode == "group":
+                    if pipeline.F_spad != "t" or pipeline.F_skpad != "t":
+                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
+                        continue
+                if hdim == 192 and tile.F_bn1 == 128:
+                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
+                    if pipeline.F_bias != "no" or pipeline.F_lse == "t":
+                        continue
+                # logits_soft_cap is only allowed if no bias
+                if not (
+                    (pipeline.F_logits == "t" and pipeline.F_bias == "no")
+                    or pipeline.F_logits == "f"
+                ):
+                    continue
+                k = FmhaFwdKernel(
+                    F_arch=factory.arch,
+                    F_idx=0,
+                    F_hdim=hdim,
+                    F_dtype=dtype,
+                    F_mode=mode,
+                    F_tile=tile,
+                    F_pipeline=pipeline,
+                    mask_impl=mask_impl,
+                )
+                # kernel_filter is an fnmatch pattern; None and "" both mean "no filter"
+                if kernel_filter:
+                    if not fnmatch.fnmatch(k.name, kernel_filter):
+                        continue
+                if optdim_list != [-1]:
+                    if hdim not in optdim_list:
+                        continue
+                # Flash attention integration (receipts 2 and 3)
+                if receipt in (2, 3):
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= pipeline.F_vlayout == "row"
+                    cond &= pipeline.F_bias in ["no", "alibi"]
+                    cond &= pipeline.F_squant == "f"
+                    cond &= pipeline.F_skip == "f"
+                    cond &= pipeline.F_sink == "f"
+                    if not cond:
+                        continue
+                # PyTorch integration
+                elif receipt == 4:
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= pipeline.F_vlayout == "row"
+                    cond &= pipeline.F_bias in ["no", "bias"]
+                    cond &= pipeline.F_squant == "f"
+                    cond &= pipeline.F_skip == "f"
+                    cond &= pipeline.F_sink == "f"
+                    if not cond:
+                        continue
+                # Aiter(mha_fwd) integration
+                elif receipt == 100:
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= mode == "batch"
+                    cond &= pipeline.F_vlayout == "row"
+                    cond &= pipeline.F_squant == "f"
+                    if not cond:
+                        continue
+                # Aiter(mha_varlen_fwd) integration
+                elif receipt == 200:
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= mode == "group"
+                    cond &= pipeline.F_vlayout == "row"
+                    cond &= pipeline.F_squant == "f"
+                    if not cond:
+                        continue
+                # aiter::mha_fwd C++ api integration
+                elif receipt == 600:
+                    cond = dtype in ["fp16", "bf16"]
+                    cond &= pipeline.F_vlayout == "row"
+                    cond &= pipeline.F_squant == "f"
+                    if not cond:
+                        continue
+
+                # fp32 only
+                if receipt == 800 or receipt == 801:
+                    cond = dtype == "fp32"
+                    if not cond:
+                        continue
+
+                api_pool.register_traits(k.api_trait())
+                gen.append(k)
+
+    return (api_pool, gen)
+
+
+def write_single_fwd_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
+    update_file(autogen_dir / kernel.filename, kernel.template)  # skips the write if unchanged
+
+
+def write_fwd_api(api_pool: FmhaFwdApiPool, autogen_dir: Path) -> None:
+    update_file(autogen_dir / FMHA_FWD_API_FILENAME, api_pool.api)  # skips the write if unchanged
+
+
+def write_blobs(
+    targets: List[str],
+    output_dir: Path,
+    kernel_filter: str,
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    """Write every generated kernel source and the dispatch API source into output_dir."""
+    pool, blobs = get_fwd_blobs(targets, kernel_filter, receipt, optdim_list, mask_impl)
+    for blob in blobs:
+        write_single_fwd_kernel(blob, output_dir)
+    # The API file is written last, after all kernel sources.
+    write_fwd_api(pool, output_dir)
+
+
+def list_blobs(
+    targets: List[str],
+    file_path: Path,
+    kernel_filter: str,
+    receipt,
+    optdim_list,
+    mask_impl,
+) -> None:
+    """Append the path of each to-be-generated file to file_path, one per line."""
+    with file_path.open("a") as out:
+        blobs = get_fwd_blobs(targets, kernel_filter, receipt, optdim_list, mask_impl)
+        gen_dir = file_path.parent / GEN_DIR
+        for blob in blobs[1]:
+            out.write((gen_dir / blob.filename).as_posix() + "\n")
+        out.write((gen_dir / FMHA_FWD_API_FILENAME).as_posix() + "\n")
--- a/example/ck_tile/01_fmha/codegen/utils.py
+++ b/example/ck_tile/01_fmha/codegen/utils.py
@@ -0,0 +1,70 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+# generate kernel instances to speed up compilation
+import dataclasses
+import os.path as path
+import textwrap
+
+
+def update_file(file_path, content):
+    """Write content to file_path unless the file already holds exactly that content.
+
+    It avoids unnecessary touching of the file which triggers rebuilds
+    """
+
+    existing_content = None  # None, not "": create a missing file even if content is empty
+    if path.exists(file_path):
+        with open(file_path, "r") as file:
+            existing_content = file.read()
+    if existing_content == content:
+        return
+    with open(file_path, "w") as file:
+        file.write(content)
+
+
+def indent(code: str, indent: str = "    ") -> str:
+    return textwrap.indent(code, indent)  # whitespace-only lines are left without the prefix
+
+
+def if_(i: int) -> str:
+    return "else if" if i != 0 else "if"  # only the first branch of a chain is a plain "if"
+
+
+def check_duplicates_and_paddings(traits, trait):
+    """Raise if trait duplicates an entry of traits or can never be reached after them.
+
+    Only entries equal to trait in every non-pad field are compared. Paddings must be
+    ordered consistently: for example, f, _t_, f, t cannot be before f, _f_, f, t.
+    """
+
+    fields = [f.name for f in dataclasses.fields(trait)]
+    pad_fields = [f for f in fields if "pad" in f]
+    non_pad_fields = [f for f in fields if "pad" not in f]
+    for prev_trait in traits:
+        if any(getattr(trait, f) != getattr(prev_trait, f) for f in non_pad_fields):
+            continue
+        if all(getattr(trait, f) == getattr(prev_trait, f) for f in pad_fields):
+            raise Exception(f"Duplicate found {trait}")
+        # Check if the previous kernel can be incorrectly used before the current one
+        # for example, f, _t_, f, t cannot be before f, _f_, f, t
+        is_prev_more_restrictive = False
+        is_curr_more_restrictive = False
+        for f in pad_fields:
+            prev_pad = getattr(prev_trait, f)
+            pad = getattr(trait, f)
+            if isinstance(prev_pad, str):
+                prev_pad = 1000000 if prev_pad == "f" else 1  # "f" ranks above any other value
+                pad = 1000000 if pad == "f" else 1
+            elif isinstance(prev_pad, int):
+                prev_pad = 1000000 if prev_pad == 0 else prev_pad  # 0 ranks above any other value
+                pad = 1000000 if pad == 0 else pad
+            else:
+                raise AssertionError(f"unsupported padding value {prev_pad!r} in field {f}")
+            if prev_pad < pad:
+                is_prev_more_restrictive = True
+            elif prev_pad > pad:
+                is_curr_more_restrictive = True
+        if is_prev_more_restrictive and not is_curr_more_restrictive:
+            raise Exception(
+                f"Kernel will never be used because paddings are not ordered correctly:\n{prev_trait} supersedes\n{trait}"
+            )
--- a/example/ck_tile/01_fmha/example_fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/example_fmha_bwd.cpp
@@ -0,0 +1,199 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "fmha_bwd.hpp"
+#include "fmha_bwd_runner.hpp"
+
+#include <string>
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser; // each insert() below presumably takes (name, default, help)
+    arg_parser.insert("v", "1", "whether do CPU validation or not")
+        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
+        .insert("b", "2", "batch size")
+        .insert("h", "8", "num of head, for q")
+        .insert("h_k",
+                "-1",
+                "num of head, for k/v, -1 means equal to h\n"
+                "if not equal to h, then this is GQA/MQA case")
+        .insert("s",
+                "3328",
+                "seqlen_q. if group-mode, means the average value of seqlen_q\n"
+                "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary\n"
+                "also with \"-s=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_qpad",
+                "-1",
+                "padded seqlen_q per batch (group mode only). "
+                "Use \"-s_qpad=p0,p1,...\"; -1 disables explicit padding")
+        .insert("s_k",
+                "-1",
+                "seqlen_k, -1 means equal to s\n"
+                "also with \"-s_k=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_kpad",
+                "-1",
+                "padded seqlen_k per batch (group mode only). "
+                "Use \"-s_kpad=k0,k1,...\"; -1 disables explicit padding")
+        .insert("d", "128", "head dim for q, k")
+        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
+        .insert("scale", "0", "scale factor. 0 means equal to 1/sqrt(hdim)")
+        .insert("iperm",
+                "1",
+                "permute input\n"
+                "if true, will be b*h*s*d, else b*s*h*d")
+        .insert("operm", "1", "permute output")
+        .insert("bias",
+                "n",
+                "n or 0, no bias\n"
+                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
+                "a(libi) or 2, alibi with 1*h. a:1, b*h")
+        .insert("dbias", "0", "output bias gradient or not")
+        .insert("prec", "fp16", "data type. fp32/fp16/bf16")
+        .insert("mask",
+                "0",
+                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
+                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
+                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
+                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
+                "'xt:window_size', xformer style masking from top-left, window_size negative is "
+                "causal, positive is swa\n"
+                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
+                "causal, positive is swa\n"
+                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
+                "now)")
+        .insert("kname", "0", "if set to 1 will print kernel name")
+        .insert("init",
+                "uf",
+                "init method:\n  ui or 0 - uniform random int\n  uf or 1 - uniform random float"
+                "\n  tf or 2 - trig float")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("p_drop", "0", "0~1 probability of dropout")
+        .insert("drop_seed", "1", "seed for dropout random number generator")
+        .insert("drop_offset", "0", "offset for dropout random number generator")
+        .insert(
+            "drop_prefs",
+            "0",
+            "whether dropout seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("deterministic",
+                "0",
+                "if set to 1 will use multi-buffer reduction strategy for dq, atomic operation "
+                "will not be used")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "fmha_bwd.json", "json file name to dump results");
+
+    bool result = arg_parser.parse(argc, argv); // main() treats a false result as a failure
+    return std::make_tuple(result, arg_parser); // returns (parse result, configured parser)
+}
+
+template <typename DataTypeConfig>
+auto run(const ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type    = arg_parser.get_str("prec"); // not referenced again in this function
+    int do_validation        = arg_parser.get_int("v");
+    mode_enum mode           = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
+    ck_tile::index_t batch   = arg_parser.get_int("b");
+    ck_tile::index_t nhead   = arg_parser.get_int("h");
+    ck_tile::index_t nhead_k = arg_parser.get_int("h_k");
+    auto seqlen_qs           = arg_parser.get_int_vec("s");
+    auto seqlen_qpads        = arg_parser.get_int_vec("s_qpad");
+    auto seqlen_ks           = arg_parser.get_int_vec("s_k");
+    auto seqlen_kpads        = arg_parser.get_int_vec("s_kpad");
+    ck_tile::index_t hdim_q  = arg_parser.get_int("d");
+    ck_tile::index_t hdim_v  = arg_parser.get_int("d_v");
+    bool i_perm              = arg_parser.get_bool("iperm");
+    bool o_perm              = arg_parser.get_bool("operm");
+    float scale              = arg_parser.get_float("scale");
+    std::string bias_str     = arg_parser.get_str("bias");
+    bool use_dbias           = arg_parser.get_bool("dbias");
+    float p_drop             = arg_parser.get_float("p_drop");
+    uint64_t drop_seed       = arg_parser.get_uint64("drop_seed");
+    uint64_t drop_offset     = arg_parser.get_uint64("drop_offset");
+    bool drop_prefs          = arg_parser.get_bool("drop_prefs");
+    std::string mask_str     = arg_parser.get_str("mask");
+    bool deterministic       = arg_parser.get_bool("deterministic");
+    std::string init_method  = arg_parser.get_str("init");
+    uint32_t seed            = arg_parser.get_uint32("seed");
+    // stream configuration built from the "kname", "warmup", "repeat" and "timer" options
+    ck_tile::stream_config stream_config{nullptr,
+                                         true,
+                                         /* log_level = */ (arg_parser.get_bool("kname") ? 1 : 0),
+                                         arg_parser.get_int("warmup"),
+                                         arg_parser.get_int("repeat"),
+                                         arg_parser.get_str("timer") == std::string("gpu")};
+    // json holds the "jsonfile" name only when the "json" option equals 1, else it is empty
+    auto json = arg_parser.get_int("json") == 1
+                    ? std::optional<std::string>{arg_parser.get_str("jsonfile")}
+                    : std::nullopt;
+
+    return fmha_bwd_run<DataTypeConfig>(mode,
+                                        batch,
+                                        nhead,
+                                        nhead_k,
+                                        seqlen_qs,
+                                        seqlen_ks,
+                                        seqlen_qpads,
+                                        seqlen_kpads,
+                                        hdim_q,
+                                        hdim_v,
+                                        i_perm,
+                                        o_perm,
+                                        scale,
+                                        bias_str,
+                                        use_dbias,
+                                        p_drop,
+                                        drop_seed,
+                                        drop_offset,
+                                        drop_prefs,
+                                        mask_str,
+                                        deterministic,
+                                        init_method,
+                                        seed,
+                                        do_validation,
+                                        stream_config,
+                                        json);
+}
+
+int main(int argc, char* argv[])
+{
+    try // exit codes: 0 success, -1 bad arguments or precision, -2 failed run or other error
+    {
+        auto [result, arg_parser] = create_args(argc, argv);
+        if(!result)
+            return -1; // argument parsing failed
+
+        const std::string data_type = arg_parser.get_str("prec"); // selects the run<> instance
+        if(data_type == "fp32")
+        {
+            return run<FmhaBwdFp32>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "fp16")
+        {
+            return run<FmhaBwdFp16>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        else if(data_type == "bf16")
+        {
+            return run<FmhaBwdBf16>(arg_parser) == bwd_result::success ? 0 : -2;
+        }
+        std::cerr << "Unsupported precision: " << data_type << std::endl;
+        return -1;
+    }
+    catch(const std::invalid_argument& e) // reported like a bad command line (-1)
+    {
+        std::cerr << "Invalid argument: " << e.what() << std::endl;
+        return -1;
+    }
+    catch(const std::exception& e) // any other standard exception is a runtime failure (-2)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -2;
+    }
+}
--- a/example/ck_tile/01_fmha/example_fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/example_fmha_fwd.cpp
@@ -0,0 +1,271 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "fmha_fwd.hpp"
+#include "fmha_fwd_runner.hpp"
+
+#include <string>
+
+// Registers every command-line option of the FMHA forward example (name, default value,
+// help text) and parses argv against them.
+//
+// Returns a tuple of (parse result, populated ck_tile::ArgParser). The caller is expected to
+// check the bool before reading any option.
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    // The default for "v" is "1", which is the CPU validation level.
+    arg_parser.insert("v", "1", "0:no validation, 1:cpu validation, 2:gpu validation(experimental)")
+        .insert("mode", "0", "kernel mode. 0:batch, 1:group")
+        .insert("b", "2", "batch size")
+        .insert("h", "8", "num of head, for q")
+        .insert("h_k",
+                "-1",
+                "num of head, for k/v, -1 means equal to h\n"
+                "if not equal to h, then this is GQA/MQA case")
+        .insert("s",
+                "3328",
+                "seqlen_q. if group-mode, means the average value of seqlen_q\n"
+                "total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary\n"
+                "also with \"-s=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_k",
+                "-1",
+                "seqlen_k (including new key/value), -1 means equal to s\n"
+                "also with \"-s_k=s0,s1,s2...\" comma-separated ints to set seqlen per batch "
+                "(group mode)")
+        .insert("s_knew",
+                "0",
+                "seqlen_k for new key/value, 0 means not to use this at all; "
+                "-1 to choose s_knew in [1, s] randomly.")
+        .insert("s_qpad",
+                "-1",
+                "seqlen_q stride between 2 batches (group-mode optional).\n"
+                "Provide positive strides per-batch to simulate physical padding on Q.")
+        .insert("s_kpad",
+                "-1",
+                "seqlen_k stride between 2 batches, currently used in group-mode only\n"
+                "for kv-cache case, each batch [1,s,h,d]/[1,h,s,d] can have a stride\n"
+                "along seqlen, instead of packed, same as xformer kv_padding,\n"
+                "must be greater than or equal to s_k")
+        .insert("d", "128", "head dim for q, k")
+        .insert("d_v", "-1", "head dim for v, -1 means equal to d")
+        .insert("scale_s", "0", "scale factor of S. 0 means equal to 1/sqrt(hdim)")
+        .insert("qscale",
+                "n",
+                "quant scale:\n"
+                "  n or 0, no scale\n"
+                "  pt or 1, per-tensor scale\n"
+                "  bs or 2, block scale\n"
+                "  kvbs or 3, Q per-tensor, K/V per-page block scale\n"
+                "  mx or 4, microscaling (exclusively for data types like mxfp8 and mxfp4)")
+        .insert("logits_soft_cap", "0", "attention logits soft capping value.")
+        .insert("iperm",
+                "1",
+                "permute input\n"
+                "if true, will be b*h*s*d, else b*s*h*d")
+        .insert("operm", "1", "permute output")
+        .insert("bias",
+                "n",
+                "n or 0, no bias\n"
+                "e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s\n"
+                "a(libi) or 2, alibi with 1*h. a:1, b*h")
+        .insert("prec", "fp16", "data type: fp32/fp16/bf16/fp8/fp8bf16/fp8fp32/mxfp8/mxfp4")
+        .insert("mask",
+                "0",
+                "0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b')\n"
+                "'t', top-left causal mask, 'b', bottom-r causal mask\n"
+                "'t:l,r', top-left sliding window attn(swa) with FA style left right size\n"
+                "'b:l,r', bottom-r sliding window attn(swa) with FA style left right size\n"
+                "'xt:window_size', xformer style masking from top-left, window_size negative is "
+                "causal, positive is swa\n"
+                "'xb:window_size', xformer style masking from bottom-r, window_size negative is "
+                "causal, positive is swa\n"
+                "'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for "
+                "now)")
+        .insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
+        .insert("lse", "0", "0 not store lse, 1 store lse")
+        .insert("kname", "0", "if set to 1 will print kernel name")
+        // NOTE(review): the help text labels both option 2 and option 3 as "tf"; the intended
+        // mnemonic for option 3 is not visible here -- confirm against the runner's parser.
+        .insert("init",
+                "uf",
+                "init method:\n  ui or 0 - uniform random int\n  ni - normalized random int"
+                "\n  uf or 1 - uniform random float\n  nf - normalized random float"
+                "\n  tf or 2 - trig float"
+                "\n  tf or 3 - uniform random float, min max is the max of the type\n")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("p_drop", "0", "0~1 probability of dropout")
+        .insert("drop_seed", "1", "seed for dropout random number generator")
+        .insert("drop_offset", "0", "offset for dropout random number generator")
+        .insert(
+            "drop_prefs",
+            "0",
+            "whether dropout seed and offset values are present on GPU; 0 - host, 1 - device/GPU")
+        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
+        .insert(
+            "rotary_dim", "0", "RoPE rotary dimension. rotary_dim <= 0 means not apply RoPE at all")
+        .insert("rotary_interleaved", "1", "whether to apply interleaved RoPE")
+        .insert("num_splits",
+                "1",
+                "# of splits for key/value. 0 to determine actual number by heuristic")
+        .insert("page_block_size", "0", "paged-kvcache block size. 0 means not use paged-kvcache")
+        .insert("cache_batch_idx", "0", "whether to use index map to the kvcache")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "fmha_fwd.json", "json file name to dump results")
+        .insert("q_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for Q (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.")
+        .insert("kv_eff_lens",
+                "",
+                "Batch-mode only: per-batch effective seqlen for KV (exclude PAD).\n"
+                "Comma-separated list of length 'b'. If empty, no override.")
+        .insert("init_sink", "0", "value to init the output tensor sink value for validation");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// Reads every option from the parsed command line and forwards them, positionally, to
+// fmha_fwd_run<DataTypeConfig>. Returns whatever fmha_fwd_run returns.
+template <typename DataTypeConfig>
+auto run(const ck_tile::ArgParser& arg_parser)
+{
+    // --- validation level and problem shape ---
+    int validation_level       = arg_parser.get_int("v");
+    mode_enum kernel_mode      = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
+    ck_tile::index_t batch_cnt = arg_parser.get_int("b");
+    ck_tile::index_t heads_q   = arg_parser.get_int("h");
+    ck_tile::index_t heads_kv  = arg_parser.get_int("h_k");
+    auto q_lengths             = arg_parser.get_int_vec("s");
+    auto k_lengths             = arg_parser.get_int_vec("s_k");
+    ck_tile::index_t dim_qk    = arg_parser.get_int("d");
+    ck_tile::index_t dim_v     = arg_parser.get_int("d_v");
+    ck_tile::index_t k_new_len = arg_parser.get_int("s_knew");
+
+    // --- padding strides and effective-length overrides ---
+    auto k_pad_strides    = arg_parser.get_int_vec("s_kpad");
+    auto q_pad_strides    = arg_parser.get_int_vec("s_qpad");
+    auto q_effective_len  = arg_parser.get_int_vec("q_eff_lens");
+    auto kv_effective_len = arg_parser.get_int_vec("kv_eff_lens");
+
+    // --- layout, scaling and kv-cache options ---
+    ck_tile::index_t rope_dim     = arg_parser.get_int("rotary_dim");
+    bool permute_in               = arg_parser.get_bool("iperm");
+    bool permute_out              = arg_parser.get_bool("operm");
+    float s_scale                 = arg_parser.get_float("scale_s");
+    float soft_cap                = arg_parser.get_float("logits_soft_cap");
+    bool v_row_major              = arg_parser.get_str("vlayout") == "r";
+    bool store_lse                = arg_parser.get_bool("lse");
+    ck_tile::index_t kv_page_size = arg_parser.get_int("page_block_size");
+    bool use_batch_idx_map        = arg_parser.get_bool("cache_batch_idx");
+
+    // --- bias / quant-scale / dropout / mask specifications ---
+    std::string bias_spec   = arg_parser.get_str("bias");
+    std::string qscale_spec = arg_parser.get_str("qscale");
+    float dropout_prob      = arg_parser.get_float("p_drop");
+    uint64_t dropout_seed   = arg_parser.get_uint64("drop_seed");
+    uint64_t dropout_offset = arg_parser.get_uint64("drop_offset");
+    bool dropout_prefs      = arg_parser.get_bool("drop_prefs");
+    std::string mask_spec   = arg_parser.get_str("mask");
+
+    // --- remaining knobs ---
+    bool rope_interleaved      = arg_parser.get_bool("rotary_interleaved");
+    ck_tile::index_t kv_splits = arg_parser.get_int("num_splits");
+    std::string init_spec      = arg_parser.get_str("init");
+    uint32_t rng_seed          = arg_parser.get_uint32("seed");
+    int sink_init              = arg_parser.get_int("init_sink");
+
+    // Launch/timing configuration; "kname" toggles log level 1, "timer" picks the gpu timer.
+    ck_tile::stream_config launch_cfg{
+        nullptr,
+        true,
+        /* log_level = */ (arg_parser.get_bool("kname") ? 1 : 0),
+        arg_parser.get_int("warmup"),
+        arg_parser.get_int("repeat"),
+        arg_parser.get_str("timer") == std::string("gpu")};
+
+    // The JSON output path is only populated when "-json=1" was requested.
+    std::optional<std::string> json_path = std::nullopt;
+    if(arg_parser.get_int("json") == 1)
+    {
+        json_path = std::optional<std::string>{arg_parser.get_str("jsonfile")};
+    }
+
+    // Argument order below must match fmha_fwd_run's positional parameter list.
+    return fmha_fwd_run<DataTypeConfig>(kernel_mode,
+                                        batch_cnt,
+                                        heads_q,
+                                        heads_kv,
+                                        q_lengths,
+                                        k_lengths,
+                                        dim_qk,
+                                        dim_v,
+                                        k_new_len,
+                                        q_pad_strides,
+                                        k_pad_strides,
+                                        q_effective_len,
+                                        kv_effective_len,
+                                        rope_dim,
+                                        permute_in,
+                                        permute_out,
+                                        s_scale,
+                                        soft_cap,
+                                        v_row_major,
+                                        store_lse,
+                                        kv_page_size,
+                                        use_batch_idx_map,
+                                        bias_spec,
+                                        dropout_prob,
+                                        dropout_seed,
+                                        dropout_offset,
+                                        dropout_prefs,
+                                        mask_spec,
+                                        qscale_spec,
+                                        rope_interleaved,
+                                        kv_splits,
+                                        init_spec,
+                                        rng_seed,
+                                        validation_level,
+                                        sink_init,
+                                        launch_cfg,
+                                        json_path);
+}
+
+// Entry point of the FMHA forward example.
+// Exit codes: 0 on success, -1 for a rejected command line or an unsupported precision,
+// -2 when the run does not report success or a non-argument exception is thrown.
+int main(int argc, char* argv[])
+{
+    // Translate the runner's result into the process exit code.
+    const auto exit_code_of = [](const auto& outcome) {
+        return outcome == fwd_result::success ? 0 : -2;
+    };
+
+    try
+    {
+        auto [parsed_ok, parser] = create_args(argc, argv);
+        if(!parsed_ok)
+        {
+            return -1;
+        }
+
+        // Each supported precision instantiates its own runner.
+        const std::string precision = parser.get_str("prec");
+        if(precision == "fp32")
+        {
+            return exit_code_of(run<FmhaFwdFp32>(parser));
+        }
+        if(precision == "fp16")
+        {
+            return exit_code_of(run<FmhaFwdFp16>(parser));
+        }
+        if(precision == "bf16")
+        {
+            return exit_code_of(run<FmhaFwdBf16>(parser));
+        }
+        if(precision == "fp8")
+        {
+            return exit_code_of(run<FmhaFwdFp8>(parser));
+        }
+        if(precision == "fp8bf16")
+        {
+            return exit_code_of(run<FmhaFwdFp8Bf16>(parser));
+        }
+        if(precision == "fp8fp32")
+        {
+            return exit_code_of(run<FmhaFwdFp8Fp32>(parser));
+        }
+        if(precision == "mxfp8")
+        {
+            return exit_code_of(run<FmhaFwdMxFp8>(parser));
+        }
+        if(precision == "mxfp4")
+        {
+            return exit_code_of(run<FmhaFwdMxFp4>(parser));
+        }
+
+        std::cerr << "Unsupported precision: " << precision << std::endl;
+        return -1;
+    }
+    catch(const std::invalid_argument& e)
+    {
+        std::cerr << "Invalid argument: " << e.what() << std::endl;
+        return -1;
+    }
+    catch(const std::exception& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return -2;
+    }
+}
--- a/example/ck_tile/01_fmha/fmha_bwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.hpp
@@ -0,0 +1,588 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/device_prop.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/fmha.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "mask.hpp"
+#include "bias.hpp"
+
+#include <type_traits>
+#include <utility>
+#include <variant>
+#include <iostream>
+#include <functional>
+
+// Empty tag types, one per supported backward precision. They carry no data and are used
+// only as template arguments (e.g. to pick an FmhaBwdTypeConfig specialization).
+struct FmhaBwdFp32 {};
+
+struct FmhaBwdFp16 {};
+
+struct FmhaBwdBf16 {};
+
+// Maps a precision tag to the set of element types used by the backward pass.
+// Only the explicit specializations below are defined.
+template <typename DataType>
+struct FmhaBwdTypeConfig;
+
+// fp32: every alias is float except the random-value output, which is uint8_t.
+template <>
+struct FmhaBwdTypeConfig<FmhaBwdFp32>
+{
+    // q / k / v / bias / o element types
+    using QDataType    = float;
+    using KDataType    = float;
+    using VDataType    = float;
+    using BiasDataType = float;
+    using ODataType    = float;
+
+    // gemm operand, accumulation and auxiliary types
+    using GemmDataType          = float;
+    using AccDataType           = float; // data type for gemm accumulation
+    using LSEDataType           = float;
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+
+    // gradient element types
+    using OGradDataType    = float;
+    using QGradDataType    = float;
+    using KGradDataType    = float;
+    using VGradDataType    = float;
+    using BiasGradDataType = float;
+};
+
+// fp16: tensors and gradients use ck_tile::half_t; LSE, accumulation and D stay float.
+template <>
+struct FmhaBwdTypeConfig<FmhaBwdFp16>
+{
+    // q / k / v / bias / o element types
+    using QDataType    = ck_tile::half_t;
+    using KDataType    = ck_tile::half_t;
+    using VDataType    = ck_tile::half_t;
+    using BiasDataType = ck_tile::half_t;
+    using ODataType    = ck_tile::half_t;
+
+    // gemm operand, accumulation and auxiliary types
+    using GemmDataType          = ck_tile::half_t;
+    using AccDataType           = float; // data type for gemm accumulation
+    using LSEDataType           = float;
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+
+    // gradient element types
+    using OGradDataType    = ck_tile::half_t;
+    using QGradDataType    = ck_tile::half_t;
+    using KGradDataType    = ck_tile::half_t;
+    using VGradDataType    = ck_tile::half_t;
+    using BiasGradDataType = ck_tile::half_t;
+};
+
+// bf16: tensors and gradients use ck_tile::bf16_t; LSE, accumulation and D stay float.
+template <>
+struct FmhaBwdTypeConfig<FmhaBwdBf16>
+{
+    // q / k / v / bias / o element types
+    using QDataType    = ck_tile::bf16_t;
+    using KDataType    = ck_tile::bf16_t;
+    using VDataType    = ck_tile::bf16_t;
+    using BiasDataType = ck_tile::bf16_t;
+    using ODataType    = ck_tile::bf16_t;
+
+    // gemm operand, accumulation and auxiliary types
+    using GemmDataType          = ck_tile::bf16_t;
+    using AccDataType           = float; // data type for gemm accumulation
+    using LSEDataType           = float;
+    using DDataType             = float;
+    using RandValOutputDataType = uint8_t;
+
+    // gradient element types
+    using OGradDataType    = ck_tile::bf16_t;
+    using QGradDataType    = ck_tile::bf16_t;
+    using KGradDataType    = ck_tile::bf16_t;
+    using VGradDataType    = ck_tile::bf16_t;
+    using BiasGradDataType = ck_tile::bf16_t;
+};
+
+// Named aliases for the ck_tile::GenericAttentionMask instantiations used by this example.
+// NOTE(review): the meaning of the boolean template arguments is defined by
+// GenericAttentionMask itself and is not visible here.
+struct FmhaMasks
+{
+    using NoMask      = ck_tile::GenericAttentionMask<false>;
+    using CausalMask  = ck_tile::GenericAttentionMask<true, false>;
+    using GenericMask = ck_tile::GenericAttentionMask<true, true>;
+};
+
+// Runtime arguments for the FMHA backward kernels. Some fields are passed on to the kernel
+// arguments (kargs); others are only used to compute the launch grid.
+struct fmha_bwd_args
+{
+    // Input tensors (read-only).
+    const void* q_ptr;
+    const void* k_ptr;
+    const void* v_ptr;
+    const void* bias_ptr; // bias or alibi_slope pointer
+    const void* o_ptr;
+    const void* lse_ptr;
+    const void* do_ptr;
+    // Buffers handed to the kernels as mutable pointers.
+    void* d_ptr;
+    void* rand_val_ptr;
+    void* dq_ptr;
+    void* dk_ptr;
+    void* dv_ptr;
+    void* dbias_ptr;
+    void* dq_acc_ptr;
+
+    // Usage notes for sequence length pointer parameters:
+    //
+    // Batch mode describes the problem with a single seqlen_q / seqlen_k pair, whereas group
+    // mode describes each sequence through the seqstart_* arrays (see the kargs builders that
+    // consume this struct).
+    //
+    // With padding:
+    //   Group mode:
+    //     - seqstart_q_ptr, seqstart_k_ptr: Record cumulative physical (including padding) sequence
+    //     lengths. [array size: batch + 1]
+    //     - seqlen_q_ptr/seqlen_k_ptr: Records logical (excluding padding) length for each
+    //     sequence. [array size: batch]
+    //     - cu_seqlen_q_ptr/cu_seqlen_k_ptr: Records cumulative logical (excluding padding)
+    //     sequence lengths. [array size: batch + 1]
+    //     - seqlen_q_ptr (per-sequence) and cu_seqlen_q_ptr (cumulative logical) are mutually
+    //     exclusive. Use one set, not both.
+    //
+    //   Batch mode:
+    //     - cu_seqlen_q_ptr/cu_seqlen_k_ptr: Records cumulative logical (excluding padding)
+    //     sequence lengths. [array size: batch + 1]
+    //     - seqstart_* and seqlen_* pointers must be nullptr.
+    //
+    // Without padding:
+    //   (Note: Physical length equals logical length)
+    //
+    //   Group mode:
+    //     - seqstart_q_ptr, seqstart_k_ptr: Record cumulative physical sequence lengths. [array
+    //     size: batch + 1]
+    //     - seqlen_q_ptr/seqlen_k_ptr and cu_seqlen_q_ptr/cu_seqlen_k_ptr must be nullptr.
+    //
+    //   Batch mode:
+    //     - All sequence length pointers (seqstart_*, seqlen_*, cu_seqlen_*) must be nullptr.
+    //
+    const void* seqstart_q_ptr =
+        nullptr; // Cumulative physical sequence length array [batch + 1]. (Used in Group mode)
+    const void* seqstart_k_ptr =
+        nullptr; // Cumulative physical sequence length array [batch + 1]. (Used in Group mode)
+    const void* seqlen_q_ptr = nullptr;    // Per-sequence logical (excluding padding) length array
+                                           // [batch]. (Used in Group mode with padding)
+    const void* seqlen_k_ptr = nullptr;    // Per-sequence logical (excluding padding) length array
+                                           // [batch]. (Used in Group mode with padding)
+    const void* cu_seqlen_q_ptr = nullptr; // Cumulative logical (excluding padding) sequence length
+                                           // array [batch + 1]. (Used with padding)
+    const void* cu_seqlen_k_ptr = nullptr; // Cumulative logical (excluding padding) sequence length
+                                           // array [batch + 1]. (Used with padding)
+    // Problem sizes. seqlen_q / seqlen_k are forwarded to the batch-mode kargs only;
+    // max_seqlen_q / max_seqlen_k are used when computing grid sizes.
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
+    ck_tile::index_t batch;
+    ck_tile::index_t max_seqlen_q;
+    ck_tile::index_t max_seqlen_k;
+    ck_tile::index_t hdim_q;
+    ck_tile::index_t hdim_v;
+    ck_tile::index_t nhead_q;
+    ck_tile::index_t nhead_k;
+    float scale;
+    // Per-tensor strides.
+    ck_tile::index_t stride_q;
+    ck_tile::index_t stride_k;
+    ck_tile::index_t stride_v;
+    ck_tile::index_t stride_bias; // if alibi, b*h need set this to h, 1*h need set this to 0
+    ck_tile::index_t stride_o;
+    ck_tile::index_t stride_randval;
+    ck_tile::index_t stride_do;
+    ck_tile::index_t stride_dq_acc;
+    ck_tile::index_t stride_dq;
+    ck_tile::index_t stride_dk;
+    ck_tile::index_t stride_dv;
+    ck_tile::index_t stride_dbias;
+    // Per-head strides. Note that the dq_acc stride is a long_index_t, unlike the others.
+    ck_tile::index_t nhead_stride_q;
+    ck_tile::index_t nhead_stride_k;
+    ck_tile::index_t nhead_stride_v;
+    ck_tile::index_t nhead_stride_bias;
+    ck_tile::index_t nhead_stride_o;
+    ck_tile::index_t nhead_stride_randval;
+    ck_tile::index_t nhead_stride_do;
+    ck_tile::index_t nhead_stride_lsed;
+    ck_tile::long_index_t nhead_stride_dq_acc;
+    ck_tile::index_t nhead_stride_dq;
+    ck_tile::index_t nhead_stride_dk;
+    ck_tile::index_t nhead_stride_dv;
+    ck_tile::index_t nhead_stride_dbias;
+    // Per-batch strides (forwarded to the batch-mode kargs only). The dq_acc stride is a
+    // long_index_t, unlike the others.
+    ck_tile::index_t batch_stride_q;
+    ck_tile::index_t batch_stride_k;
+    ck_tile::index_t batch_stride_v;
+    ck_tile::index_t batch_stride_bias;
+    ck_tile::index_t batch_stride_o;
+    ck_tile::index_t batch_stride_randval;
+    ck_tile::index_t batch_stride_do;
+    ck_tile::index_t batch_stride_lsed;
+    ck_tile::long_index_t batch_stride_dq_acc;
+    ck_tile::index_t batch_stride_dq;
+    ck_tile::index_t batch_stride_dk;
+    ck_tile::index_t batch_stride_dv;
+    ck_tile::index_t batch_stride_dbias;
+    ck_tile::index_t split_stride_dq_acc;
+    // Mask description.
+    ck_tile::index_t window_size_left;
+    ck_tile::index_t window_size_right;
+    ck_tile::index_t mask_type;
+    // Dropout parameters. drop_seed_offset holds either a (uint64_t, uint64_t) pair or a pair
+    // of pointers. NOTE(review): presumably (seed, offset) by value vs. by device pointer --
+    // confirm against the kernel's MakeKargs.
+    float p_drop;
+    float p_undrop;
+    std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+        drop_seed_offset;
+};
+
+// Builds the kernel arguments and launch grid for the dQ/dK/dV backward kernel.
+//
+// FmhaBwdDQDKDVKernel must provide kMaxSeqLenQ, kIsGroupMode, MakeKargsImpl(...) and
+// GridSize(...). Returns ck_tile::make_tuple(kargs, grids).
+//
+// args.nhead_q is asserted to be a multiple of args.nhead_k; the ratio nhead_q / nhead_k is
+// what gets forwarded to the kernel.
+template <typename FmhaBwdDQDKDVKernel>
+auto fmha_bwd_dq_dk_dv_create_kargs_and_grids(fmha_bwd_args args)
+{
+    assert(args.nhead_q % args.nhead_k == 0);
+    auto kargs = [&] {
+        // When the kernel's kMaxSeqLenQ is 0, the dq_acc buffer and its strides are handed to
+        // the kernel in place of dq and the dq strides ("uss" in the local name reads as "use").
+        constexpr bool dq_uss_acc  = FmhaBwdDQDKDVKernel::kMaxSeqLenQ == 0;
+        const auto dq_ptr          = dq_uss_acc ? args.dq_acc_ptr : args.dq_ptr;
+        const auto stride_dq       = dq_uss_acc ? args.stride_dq_acc : args.stride_dq;
+        const auto nhead_stride_dq = dq_uss_acc ? args.nhead_stride_dq_acc : args.nhead_stride_dq;
+        const auto batch_stride_dq = dq_uss_acc ? args.batch_stride_dq_acc : args.batch_stride_dq;
+
+        // create group mode kernel arguments
+        // (sequence-length pointer arrays are passed; no batch strides and no scalar seqlen)
+        if constexpr(FmhaBwdDQDKDVKernel::kIsGroupMode)
+        {
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
+                                                      args.k_ptr,
+                                                      args.v_ptr,
+                                                      args.bias_ptr,
+                                                      args.lse_ptr,
+                                                      args.do_ptr,
+                                                      args.d_ptr,
+                                                      args.rand_val_ptr,
+                                                      args.dk_ptr,
+                                                      args.dv_ptr,
+                                                      args.dbias_ptr,
+                                                      dq_ptr,
+                                                      args.seqstart_q_ptr,
+                                                      args.seqstart_k_ptr,
+                                                      args.seqlen_q_ptr,
+                                                      args.seqlen_k_ptr,
+                                                      args.cu_seqlen_q_ptr,
+                                                      args.cu_seqlen_k_ptr,
+                                                      args.batch,
+                                                      args.hdim_q,
+                                                      args.hdim_v,
+                                                      args.nhead_q,
+                                                      args.nhead_q / args.nhead_k,
+                                                      args.scale,
+                                                      args.stride_q,
+                                                      args.stride_k,
+                                                      args.stride_v,
+                                                      args.stride_bias,
+                                                      args.stride_randval,
+                                                      args.stride_do,
+                                                      stride_dq,
+                                                      args.stride_dk,
+                                                      args.stride_dv,
+                                                      args.stride_dbias,
+                                                      args.nhead_stride_q,
+                                                      args.nhead_stride_k,
+                                                      args.nhead_stride_v,
+                                                      args.nhead_stride_bias,
+                                                      args.nhead_stride_randval,
+                                                      args.nhead_stride_do,
+                                                      args.nhead_stride_lsed,
+                                                      nhead_stride_dq,
+                                                      args.nhead_stride_dk,
+                                                      args.nhead_stride_dv,
+                                                      args.nhead_stride_dbias,
+                                                      args.split_stride_dq_acc,
+                                                      args.window_size_left,
+                                                      args.window_size_right,
+                                                      args.mask_type,
+                                                      args.p_drop,
+                                                      args.drop_seed_offset);
+        }
+        else
+        { // create batch mode kernel arguments
+          // (scalar seqlen_q / seqlen_k plus batch strides; no sequence-length pointers)
+            return FmhaBwdDQDKDVKernel::MakeKargsImpl(args.q_ptr,
+                                                      args.k_ptr,
+                                                      args.v_ptr,
+                                                      args.bias_ptr,
+                                                      args.lse_ptr,
+                                                      args.do_ptr,
+                                                      args.d_ptr,
+                                                      args.rand_val_ptr,
+                                                      args.dk_ptr,
+                                                      args.dv_ptr,
+                                                      args.dbias_ptr,
+                                                      dq_ptr,
+                                                      args.seqlen_q,
+                                                      args.seqlen_k,
+                                                      args.batch,
+                                                      args.hdim_q,
+                                                      args.hdim_v,
+                                                      args.nhead_q,
+                                                      args.nhead_q / args.nhead_k,
+                                                      args.scale,
+                                                      args.stride_q,
+                                                      args.stride_k,
+                                                      args.stride_v,
+                                                      args.stride_bias,
+                                                      args.stride_randval,
+                                                      args.stride_do,
+                                                      stride_dq,
+                                                      args.stride_dk,
+                                                      args.stride_dv,
+                                                      args.stride_dbias,
+                                                      args.nhead_stride_q,
+                                                      args.nhead_stride_k,
+                                                      args.nhead_stride_v,
+                                                      args.nhead_stride_bias,
+                                                      args.nhead_stride_randval,
+                                                      args.nhead_stride_do,
+                                                      args.nhead_stride_lsed,
+                                                      nhead_stride_dq,
+                                                      args.nhead_stride_dk,
+                                                      args.nhead_stride_dv,
+                                                      args.nhead_stride_dbias,
+                                                      args.batch_stride_q,
+                                                      args.batch_stride_k,
+                                                      args.batch_stride_v,
+                                                      args.batch_stride_bias,
+                                                      args.batch_stride_randval,
+                                                      args.batch_stride_do,
+                                                      args.batch_stride_lsed,
+                                                      batch_stride_dq,
+                                                      args.batch_stride_dk,
+                                                      args.batch_stride_dv,
+                                                      args.batch_stride_dbias,
+                                                      args.split_stride_dq_acc,
+                                                      args.window_size_left,
+                                                      args.window_size_right,
+                                                      args.mask_type,
+                                                      args.p_drop,
+                                                      args.drop_seed_offset);
+        }
+    }();
+
+    // The grid is sized from batch, the number of q heads and the maximum k sequence length.
+    dim3 grids = FmhaBwdDQDKDVKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_k);
+    return ck_tile::make_tuple(kargs, grids);
+}
+
+// Builds the kernel arguments and launch grid for the "dO dot O" backward pre-pass kernel.
+//
+// FmhaBwdOGradDotOKernel must provide kIsGroupMode, MakeKargs(...) and GridSize(...).
+// Returns ck_tile::make_tuple(kargs, grids).
+template <typename FmhaBwdOGradDotOKernel>
+auto fmha_bwd_dot_do_o_create_kargs_and_grids(fmha_bwd_args args)
+{
+    auto kargs = [&] {
+        // create group mode kernel arguments
+        // (q sequence-length pointer arrays are passed; no batch strides)
+        if constexpr(FmhaBwdOGradDotOKernel::kIsGroupMode)
+        {
+            return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr,
+                                                     args.do_ptr,
+                                                     args.d_ptr,
+                                                     args.p_undrop,
+                                                     args.seqstart_q_ptr,
+                                                     args.seqlen_q_ptr,
+                                                     args.cu_seqlen_q_ptr,
+                                                     args.hdim_v,
+                                                     args.stride_do,
+                                                     args.stride_o,
+                                                     args.nhead_stride_do,
+                                                     args.nhead_stride_o,
+                                                     args.nhead_stride_lsed);
+        }
+        else
+        { // create batch mode kernel arguments
+          // (scalar seqlen_q plus batch strides for do / o / lsed)
+            return FmhaBwdOGradDotOKernel::MakeKargs(args.o_ptr,
+                                                     args.do_ptr,
+                                                     args.d_ptr,
+                                                     args.p_undrop,
+                                                     args.seqlen_q,
+                                                     args.hdim_v,
+                                                     args.stride_do,
+                                                     args.stride_o,
+                                                     args.nhead_stride_do,
+                                                     args.nhead_stride_o,
+                                                     args.nhead_stride_lsed,
+                                                     args.batch_stride_do,
+                                                     args.batch_stride_o,
+                                                     args.batch_stride_lsed);
+        }
+    }();
+
+    // The grid is sized from batch, the number of q heads and the maximum q sequence length.
+    dim3 grids = FmhaBwdOGradDotOKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
+    return ck_tile::make_tuple(kargs, grids);
+}
+
+// Builds the kernel-argument object and the launch grid for the dQ conversion kernel
+// (reads args.dq_acc_ptr, writes args.dq_ptr). The MakeKargs overload is selected at compile
+// time from FmhaBwdConvertQGradKernel::kIsGroupMode; the grid is always derived from
+// (batch, nhead_q, max_seqlen_q).
+// Returns ck_tile::make_tuple(kargs, grids).
+template <typename FmhaBwdConvertQGradKernel>
+auto fmha_bwd_convert_dq_create_kargs_and_grids(fmha_bwd_args args)
+{
+    using Kernel = FmhaBwdConvertQGradKernel;
+
+    // Pairs already-built kernel arguments with the launch grid.
+    const auto attach_grids = [&args](auto kargs) {
+        dim3 grids = Kernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q);
+        return ck_tile::make_tuple(kargs, grids);
+    };
+
+    if constexpr(Kernel::kIsGroupMode)
+    {
+        // group mode: sequence boundaries are passed as pointers
+        return attach_grids(Kernel::MakeKargs(args.dq_acc_ptr,
+                                              args.dq_ptr,
+                                              args.seqstart_q_ptr,
+                                              args.seqstart_k_ptr,
+                                              args.seqlen_q_ptr,
+                                              args.seqlen_k_ptr,
+                                              args.cu_seqlen_q_ptr,
+                                              args.cu_seqlen_k_ptr,
+                                              args.hdim_q,
+                                              args.stride_dq,
+                                              args.stride_dq_acc,
+                                              args.nhead_stride_dq,
+                                              args.nhead_stride_dq_acc,
+                                              args.split_stride_dq_acc));
+    }
+    else
+    {
+        // batch mode: scalar sequence lengths plus per-batch strides
+        return attach_grids(Kernel::MakeKargs(args.dq_acc_ptr,
+                                              args.dq_ptr,
+                                              args.seqlen_q,
+                                              args.seqlen_k,
+                                              args.hdim_q,
+                                              args.stride_dq,
+                                              args.stride_dq_acc,
+                                              args.nhead_stride_dq,
+                                              args.nhead_stride_dq_acc,
+                                              args.batch_stride_dq,
+                                              args.batch_stride_dq_acc,
+                                              args.split_stride_dq_acc,
+                                              args.batch,
+                                              args.nhead_q));
+    }
+}
+
+// Tag type used to pattern-match the internal kernel implementation, not to instantiate a
+// kernel. The struct is intentionally empty: it carries information only through its template
+// arguments.
+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          typename FmhaMask_,
+          typename FmhaDropout_,
+          ck_tile::BlockAttentionBiasEnum BiasEnum_,
+          bool kHasBiasGrad_,
+          ck_tile::index_t kPadD_,
+          ck_tile::index_t kPadDv_,
+          bool kIsDeterministic_,
+          bool kUseTrLoad_,
+          ck_tile::index_t MaxSeqLenQ_,
+          ck_tile::index_t kN0>
+struct fmha_bwd_dq_dk_dv_traits_
+{
+};
+
+// Declarations of the per-instance entry points of the dQ/dK/dV backward kernel, keyed by a
+// traits type and an architecture tag. Only declarations live here; the definitions are not
+// part of this header section (presumably emitted by the code generator -- confirm).
+
+// Timed launch entry point; returns a float (presumably the measured time -- confirm).
+template <typename Traits_, typename Arch = void>
+float fmha_bwd_dq_dk_dv_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Single launch without a return value; this is the form fmha_bwd_() chains via launch_kernel.
+template <typename Traits_, typename Arch = void>
+void fmha_bwd_dq_dk_dv_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Kernel name; printed by fmha_bwd_() when logging is enabled.
+template <typename Traits_, typename Arch = void>
+std::string fmha_bwd_dq_dk_dv_get_name_();
+// NOTE(review): presumably the maximum seqlen_q supported by the instance -- confirm.
+template <typename Traits_, typename Arch = void>
+int fmha_bwd_dq_dk_dv_maxq_();
+// forward declaration; the full definition appears later in this header
+struct fmha_bwd_traits;
+// NOTE(review): presumably the number of dq_acc splits needed for the given problem -- confirm.
+template <typename Traits_, typename Arch = void>
+int fmha_bwd_dq_dk_dv_dq_acc_splits_(const fmha_bwd_traits& t);
+// NOTE(review): presumably whether the dq_acc buffer must be zeroed before launch -- confirm.
+template <typename Traits_, typename Arch = void>
+bool fmha_bwd_dq_dk_dv_needs_zero_dq_acc_();
+
+// Compile-time traits of a "dO dot O" kernel instance. Each template argument is re-exported
+// as a static member / alias so that code holding only the traits type can query it.
+template <ck_tile::index_t HDim_, typename DataType_, bool kIsGroupMode_, bool kPadS_, bool kPadDv_>
+struct fmha_bwd_dot_do_o_traits_
+{
+    static constexpr ck_tile::index_t HDim = HDim_;
+    // DataType_ with cv-qualifiers and references stripped
+    using DataType                         = ck_tile::remove_cvref_t<DataType_>;
+    static constexpr bool kIsGroupMode     = kIsGroupMode_;
+    // NOTE(review): presumably "pad along the sequence / hdim_v dimension" -- confirm
+    static constexpr bool kPadS            = kPadS_;
+    static constexpr bool kPadDv           = kPadDv_;
+};
+
+// Declarations of the per-instance entry points of the "dO dot O" kernel (definitions are not
+// part of this header section).
+
+// Timed launch entry point; returns a float (presumably the measured time -- confirm).
+template <typename Traits_, typename Arch = void>
+float fmha_bwd_dot_do_o_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Single launch without a return value; this is the form fmha_bwd_() chains via launch_kernel.
+template <typename Traits_, typename Arch = void>
+void fmha_bwd_dot_do_o_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Kernel name; printed by fmha_bwd_() when logging is enabled.
+template <typename Traits_, typename Arch = void>
+std::string fmha_bwd_dot_do_o_get_name_();
+
+// Tag type identifying a dQ conversion kernel instance. The struct is intentionally empty: it
+// carries information only through its template arguments.
+template <ck_tile::index_t HDim_,
+          typename DataType_,
+          bool kIsGroupMode_,
+          bool kPadS_,
+          bool kPadD_,
+          bool kIsDeterministic_,
+          ck_tile::index_t kN0>
+struct fmha_bwd_convert_dq_traits_
+{
+};
+
+// Declarations of the per-instance entry points of the dQ conversion kernel (definitions are
+// not part of this header section).
+
+// Timed launch entry point; returns a float (presumably the measured time -- confirm).
+template <typename Traits_, typename Arch = void>
+float fmha_bwd_convert_dq_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Single launch without a return value; this is the form fmha_bwd_() chains via launch_kernel.
+template <typename Traits_, typename Arch = void>
+void fmha_bwd_convert_dq_oneshot_(const ck_tile::stream_config&, fmha_bwd_args);
+
+// Kernel name; printed by fmha_bwd_() when logging is enabled.
+template <typename Traits_, typename Arch = void>
+std::string fmha_bwd_convert_dq_get_name_();
+
+// Traits that are used to dispatch different kernel implementations for fmha backward.
+// This is a plain runtime description of the problem; it is consumed by fmha_bwd<>(),
+// fmha_bwd_launcher and fmha_bwd_dq_dk_dv_dq_acc_splits_<>() declared in this header.
+struct fmha_bwd_traits
+{
+    // problem sizes
+    int seqlen_q;
+    int seqlen_k;
+    int batch;
+    int max_seqlen_q;
+    int max_seqlen_k;
+    int hdim_q;
+    int hdim_v;
+    int nhead_q;
+    int nhead_k;
+    // element type name as a string (NOTE(review): accepted spellings are not visible here)
+    std::string data_type;
+    bool is_group_mode;
+    mask_enum mask_type;
+    bias_enum bias_type; // 0:no bias, 1:elementwise bias, 2:alibi. sync with BlockAttentionBiasEnum
+    // feature switches
+    bool has_dbias;
+    bool has_dropout;
+    bool is_store_randval;
+    bool is_deterministic;
+    // TODO: padding check is inside this api
+};
+
+// Runs the backward pipeline for one concrete set of kernel instances.
+//   T0: dot_do_o traits, T1: dq_dk_dv traits, T2: convert_dq traits (void = no conversion stage)
+// When s.log_level_ > 0 the kernel names are printed to std::cout before launching.
+// Returns whatever ck_tile::launch_kernel returns for the chained one-shot launches.
+template <typename T0 /*dot_do_o_trait*/,
+          typename T1 /*dq_dk_dv_trait*/,
+          typename T2 /*convert_dq_trait*/,
+          typename Arch>
+float fmha_bwd_(const ck_tile::stream_config& s, fmha_bwd_args a)
+{
+    const bool print_names = s.log_level_ > 0;
+
+    if constexpr(std::is_same_v<T2, void>)
+    {
+        // two stages: dot_do_o -> dq_dk_dv
+        if(print_names)
+        {
+            std::cout << ", " << fmha_bwd_dot_do_o_get_name_<T0, Arch>() << "@"
+                      << fmha_bwd_dq_dk_dv_get_name_<T1, Arch>() << std::flush;
+        }
+        return ck_tile::launch_kernel(
+            s,
+            [=](const ck_tile::stream_config& cfg) {
+                fmha_bwd_dot_do_o_oneshot_<T0, Arch>(cfg, a);
+            },
+            [=](const ck_tile::stream_config& cfg) {
+                fmha_bwd_dq_dk_dv_oneshot_<T1, Arch>(cfg, a);
+            });
+    }
+    else
+    {
+        // three stages: dot_do_o -> dq_dk_dv -> convert_dq
+        // (the log line lists convert_dq before dq_dk_dv; the launch order is as above)
+        if(print_names)
+        {
+            std::cout << ", " << fmha_bwd_dot_do_o_get_name_<T0, Arch>() << "@"
+                      << fmha_bwd_convert_dq_get_name_<T2, Arch>() << "@"
+                      << fmha_bwd_dq_dk_dv_get_name_<T1, Arch>() << std::flush;
+        }
+        return ck_tile::launch_kernel(
+            s,
+            [=](const ck_tile::stream_config& cfg) {
+                fmha_bwd_dot_do_o_oneshot_<T0, Arch>(cfg, a);
+            },
+            [=](const ck_tile::stream_config& cfg) {
+                fmha_bwd_dq_dk_dv_oneshot_<T1, Arch>(cfg, a);
+            },
+            [=](const ck_tile::stream_config& cfg) {
+                fmha_bwd_convert_dq_oneshot_<T2, Arch>(cfg, a);
+            });
+    }
+}
+
+// Public backward entry point, selected by the runtime traits. Declaration only; the
+// definition is not part of this header section. Version defaults to 2
+// (NOTE(review): the meaning of other Version values is not visible here).
+template <int Version = 2>
+float fmha_bwd(const fmha_bwd_traits&, fmha_bwd_args, const ck_tile::stream_config&);
+
+// Callable wrapper around a backward launch function chosen from fmha_bwd_traits.
+// The constructor is only declared here; presumably it fills `run`, `dq_acc_splits` and
+// `needs_zero_dq_acc` for the matching kernel instance -- confirm against its definition.
+struct fmha_bwd_launcher
+{
+    // launch function; default-constructed (empty) until a constructor assigns it
+    std::function<float(fmha_bwd_args, const ck_tile::stream_config&)> run{};
+    ck_tile::index_t dq_acc_splits{0};
+    bool needs_zero_dq_acc{true};
+
+    fmha_bwd_launcher(const fmha_bwd_traits&);
+
+    // Forwards all arguments to `run` and returns its result.
+    template <typename... Args>
+    float operator()(Args&&... args) const
+    {
+        return run(std::forward<Args>(args)...);
+    }
+};
--- a/example/ck_tile/01_fmha/fmha_bwd_runner.hpp
+++ b/example/ck_tile/01_fmha/fmha_bwd_runner.hpp
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
--- a/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd_runner.hpp
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -0,0 +1,179 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# generate kernel instances to speed up compilation
+
+import argparse
+from enum import IntEnum
+import importlib
+from pathlib import Path
+import pkgutil
+from typing import List, Optional
+
+import codegen.ops
+from codegen.cmake_config import GEN_DIR
+
+
+class HandlerId(IntEnum):
+    """Index into the (list_blobs, write_blobs) tuple stored per API in `handlers`."""
+
+    LIST_BLOBS = 0
+    WRITE_BLOBS = 1
+
+
+# inspect all modules under 'codegen.ops' and register API handlers
+ops = []
+for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__):
+    full_module_name = "%s.%s" % (codegen.ops.__name__, module_name)
+    ops.append(importer.find_spec(module_name).loader.load_module(module_name))
+unwanted_prefix = "fmha_"
+handlers = dict(
+    [
+        (
+            op.__name__[len(unwanted_prefix) :]
+            if op.__name__.startswith(unwanted_prefix)
+            else op.__name__,
+            (op.list_blobs, op.write_blobs),
+        )
+        for op in ops
+    ]
+)
+assert 0 < len(handlers)
+
+
+def write_blobs(
+    targets: List[str],
+    output_dir: Optional[str],
+    api_list: List[str],
+    filters_list: List[str],
+    optdim_list: List[int],
+    receipt,
+    mask_impl,
+) -> None:
+    if output_dir is None:
+        output_dir = Path(__file__).parent
+    else:
+        output_dir = Path(output_dir) / GEN_DIR
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    for api, kernel_filter in zip(api_list, filters_list):
+        handler = handlers[api][HandlerId.WRITE_BLOBS]
+        handler(targets, output_dir, kernel_filter, receipt, optdim_list, mask_impl)
+
+
+# list all the files that will be generated
+def list_blobs(
+    targets: List[str],
+    output_file: Optional[str],
+    api_list: List[str],
+    filters_list: List[str],
+    optdim_list: List[int],
+    receipt,
+    mask_impl,
+) -> None:
+    assert output_file is not None
+    file_path = Path(output_file)
+
+    # create an empty file / drop its contents if it exists
+    open(file_path, "w").close()
+
+    for api, kernel_filter in zip(api_list, filters_list):
+        handler = handlers[api][HandlerId.LIST_BLOBS]
+        handler(targets, file_path, kernel_filter, receipt, optdim_list, mask_impl)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK fmha kernel",
+    )
+    parser.add_argument(
+        "--targets",
+        default="gfx9,gfx950",
+        required=False,
+        help="list of GPU targets, separated by comma.",
+    )
+    parser.add_argument(
+        "-d",
+        "--direction",  # we keep 'direction' option for backward compatibility
+        "-a",
+        "--api",
+        default="fwd",
+        required=False,
+        help="supply API(s) to generate (default: fwd). separated by comma.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        required=False,
+        help="write all the blobs into a directory",
+    )
+    parser.add_argument(
+        "-l", "--list_blobs", required=False, help="list all the kernels to a file"
+    )
+    # TODO: if using filter, must apply same value to output_dir and list_blobs
+    parser.add_argument(
+        "-f",
+        "--filter",
+        default="",
+        required=False,
+        help="filter out kernels that need to generate, using fnmatch module",
+    )
+
+    parser.add_argument(
+        "-m",
+        "--mask",
+        default="simplified",
+        required=False,
+        help="mask implementation, simplified/generic",
+    )
+
+    parser.add_argument(
+        "-r",
+        "--receipt",
+        default=0,
+        required=False,
+        help="codegen receipt. 0: generate only 8xhdim coverage\n"
+        + "  1: generate more instance to cover all hdim\n"
+        + "  2: Only generate instance for Flash attention integration\n"
+        + "  4: Only generate instance for PyTorch integration\n"
+        + "  100-199: Only generate instance for Aiter(mha_fwd) integration\n"
+        + "  200-299: Only generate instance for Aiter(mha_varlen_fwd) integration\n"
+        + "  300-399: Only generate instance for Aiter(mha_bwd) integration\n"
+        + "  400-499: Only generate instance for Aiter(mha_varlen_bwd) integration\n"
+        + "  600-699: Only generate instance for aiter::mha_fwd && aiter::mha_fwd_splitkv && aiter::mha_bwd C++ api integration",
+    )
+
+    parser.add_argument(
+        "--optdim",
+        default="-1",
+        required=False,
+        help="only optimize the hdim in the list. separated by comma. -1 is the default choice"
+        + "eg. --optdim=32,64,128,256",
+    )
+
+    args = parser.parse_args()
+    targets = args.targets.split(",")
+    api_list = args.direction.split(",")
+    filter_list = args.filter.split(",")
+    filter_list.extend([""] * (len(api_list) - len(filter_list)))
+    optdim_list = [int(hdim) for hdim in args.optdim.split(",")]
+
+    if args.list_blobs is not None:
+        list_blobs(
+            targets,
+            args.list_blobs,
+            api_list,
+            filter_list,
+            optdim_list,
+            int(args.receipt),
+            mask_impl=args.mask,
+        )
+    else:
+        write_blobs(
+            targets,
+            args.output_dir,
+            api_list,
+            filter_list,
+            optdim_list,
+            int(args.receipt),
+            mask_impl=args.mask,
+        )
--- a/example/ck_tile/01_fmha/mask.hpp
+++ b/example/ck_tile/01_fmha/mask.hpp
@@ -0,0 +1,203 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha.hpp"
+
+// keep this in sync with ck_tile::GenericAttentionMaskEnum
+// Kind of attention mask selected on the command line (see mask_info::decode).
+enum class mask_enum
+{
+    no_mask = 0,
+    mask_top_left,     // decoded from "1", "t", "t:l,r[,sink]" and "xt:window"
+    mask_bottom_right, // decoded from "2", "b", "b:l,r[,sink]" and "xb:window"
+    window_generic,    // decoded from "g:y,x"
+};
+
+// Host-side description of an attention mask, decoded from its command-line spelling.
+struct mask_info
+{
+    mask_enum type;
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
+    // generic window coordinates; for the "t"/"b"/"xt"/"xb" forms they are whatever
+    // ck_tile::make_generic_attention_mask_coordinates_from_lr_window() returns
+    ck_tile::index_t y, x;
+    ck_tile::index_t left, right; // FA style SWA left/right
+    ck_tile::index_t sink;        // third value of "t:l,r,sink" / "b:l,r,sink", otherwise 0
+
+    // Writes a short tag: "n", "t(left:right)", "b(left:right)" or "g(y:x)".
+    void serialize(std::ostream& os) const
+    {
+        if(type == mask_enum::no_mask)
+            os << "n";
+        else if(type == mask_enum::mask_top_left)
+            os << "t(" << left << ":" << right << ")";
+        else if(type == mask_enum::mask_bottom_right)
+            os << "b(" << left << ":" << right << ")";
+        else
+        {
+            os << "g(" << y << ":" << x << ")";
+        }
+    }
+
+    // Decodes a mask description. Accepted spellings:
+    //   "0"                     no mask
+    //   "1" / "t"               top-left mask with left = -1, right = 0
+    //   "2" / "b"               bottom-right mask with left = -1, right = 0
+    //   "t:l,r[,sink]"          top-left window with explicit left/right (and optional sink)
+    //   "b:l,r[,sink]"          bottom-right window with explicit left/right (and optional sink)
+    //   "g:y,x"                 generic window given directly by its y/x coordinates
+    //   "xt:window" / "xb:window"  xformer style window size, split into left/right
+    // Throws std::invalid_argument for any other spelling.
+    static mask_info decode(std::string str, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k)
+    {
+        ck_tile::index_t x_total = seqlen_k;
+        ck_tile::index_t y_total = seqlen_q;
+        // value-initialize so that fields a branch does not assign (e.g. y/x for "0") are
+        // zero instead of indeterminate
+        mask_info tmp{};
+        tmp.seqlen_q = seqlen_q;
+        tmp.seqlen_k = seqlen_k;
+        auto found_0 = str.find(':');
+        if(found_0 != std::string::npos)
+        {
+            std::string t = str.substr(0, found_0);
+            std::string v = str.substr(found_0 + 1);
+            if(t == "xt" || t == "xb")
+            {
+                // xformer style sliding window attn from top-left
+                ck_tile::index_t window_size = std::stoi(v);
+                ck_tile::index_t left_size   = -1;
+                ck_tile::index_t right_size  = 0;
+                ck_tile::index_t sink_size   = 0;
+                if(window_size > 0)
+                {
+                    left_size  = window_size / 2;
+                    right_size = window_size - 1 - left_size;
+                }
+                auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
+                    left_size, right_size, sink_size, y_total, x_total, t == "xt");
+
+                tmp.type  = t == "xt" ? mask_enum::mask_top_left : mask_enum::mask_bottom_right;
+                tmp.y     = r.at(ck_tile::number<0>{});
+                tmp.x     = r.at(ck_tile::number<1>{});
+                tmp.left  = left_size;
+                tmp.right = right_size;
+                tmp.sink  = 0;
+            }
+            else if(t == "t" || t == "b" || t == "g")
+            {
+                auto found_1 = v.find(",");
+                if(found_1 == std::string::npos)
+                {
+                    throw std::invalid_argument("invalid mask value: " + str);
+                }
+                // TODO: some validation
+                ck_tile::index_t v0 = atoi(v.substr(0, found_1).c_str());
+                if(t == "g")
+                {
+                    // "g:y,x": the second value must be parsed here as well, otherwise
+                    // x/right would silently stay 0 whatever the user passed
+                    ck_tile::index_t v1 = atoi(v.substr(found_1 + 1).c_str());
+
+                    tmp.type  = mask_enum::window_generic;
+                    tmp.y     = v0;
+                    tmp.x     = v1;
+                    tmp.left  = v0; // TODO: don't use this?
+                    tmp.right = v1;
+                    tmp.sink  = 0;
+                }
+                else
+                {
+                    // "t:l,r[,sink]" / "b:l,r[,sink]"
+                    const bool is_top_left = (t == "t");
+                    auto found_2           = v.find(',', found_1 + 1);
+                    ck_tile::index_t v1    = 0;
+                    ck_tile::index_t sink  = 0;
+                    if(found_2 != std::string::npos)
+                    {
+                        v1   = atoi(v.substr(found_1 + 1, found_2 - found_1 - 1).c_str());
+                        sink = atoi(v.substr(found_2 + 1).c_str());
+                    }
+                    else
+                    {
+                        v1 = atoi(v.substr(found_1 + 1).c_str());
+                    }
+                    tmp.type =
+                        is_top_left ? mask_enum::mask_top_left : mask_enum::mask_bottom_right;
+                    auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
+                        v0, v1, sink, y_total, x_total, is_top_left);
+                    tmp.y     = r.at(ck_tile::number<0>{});
+                    tmp.x     = r.at(ck_tile::number<1>{});
+                    tmp.left  = v0;
+                    tmp.right = v1;
+                    tmp.sink  = sink;
+                }
+            }
+            else
+            {
+                throw std::invalid_argument("invalid mask value: " + str);
+            }
+        }
+        else if(str == "0")
+        {
+            tmp.type  = mask_enum::no_mask;
+            tmp.left  = -1;
+            tmp.right = -1;
+            tmp.sink  = 0;
+        }
+        else if(str == "1" || str == "t")
+        {
+            tmp.type  = mask_enum::mask_top_left;
+            tmp.y     = seqlen_q;
+            tmp.x     = 1;
+            tmp.left  = -1;
+            tmp.right = 0;
+            tmp.sink  = 0;
+        }
+        else if(str == "2" || str == "b")
+        {
+            tmp.type  = mask_enum::mask_bottom_right;
+            tmp.y     = seqlen_q;
+            tmp.x     = seqlen_k - seqlen_q + 1;
+            tmp.left  = -1;
+            tmp.right = 0;
+            tmp.sink  = 0;
+        }
+        else
+        {
+            throw std::invalid_argument("invalid mask value: " + str);
+        }
+        return tmp;
+    }
+
+    // Counts the (row, column) positions that are not masked out: the full
+    // seqlen_q * seqlen_k rectangle without a mask, otherwise the per-row sum of the
+    // column range [max(i_y - y + 1, 0), min(i_y + x, seqlen_k)).
+    std::size_t get_unmaskarea() const
+    {
+        if(type == mask_enum::no_mask)
+            return static_cast<std::size_t>(seqlen_q) * seqlen_k;
+        std::size_t area = 0;
+        for(ck_tile::index_t i_y = 0; i_y < seqlen_q; ++i_y)
+        {
+            ck_tile::index_t x_start = std::max(-y + i_y + 1, static_cast<ck_tile::index_t>(0));
+            ck_tile::index_t x_end   = std::min(i_y + x, seqlen_k);
+            if(x_end > x_start)
+            {
+                area += (x_end - x_start);
+            }
+        }
+        return area;
+    }
+
+    // Streams the serialize() form of the mask.
+    friend std::ostream& operator<<([[clang::lifetimebound]] std::ostream& os, const mask_info& mi)
+    {
+        mi.serialize(os);
+        return os;
+    }
+};
--- a/example/ck_tile/01_fmha/misc/gamc.png
+++ b/example/ck_tile/01_fmha/misc/gamc.png
--- a/example/ck_tile/01_fmha/quant.hpp
+++ b/example/ck_tile/01_fmha/quant.hpp
@@ -0,0 +1,78 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/fmha.hpp"
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wlifetime-safety-intra-tu-suggestions"
+
+// keep sync with BlockAttentionQuantScaleEnum
+// Quantization scale mode; the short names in the comments are the spellings accepted by
+// quant_scale_info::decode (the numeric value is accepted as well).
+enum class quant_scale_enum
+{
+    no_scale      = 0, // "n"
+    pertensor     = 1, // "pt"
+    blockscale    = 2, // "bs"
+    kv_blockscale = 3, // "kvbs": Q per-tensor, K/V per-page block scale
+    mx            = 4, // "mx": Microscaling (MX)
+};
+
+// Command-line representation of the quantization scale mode.
+struct quant_scale_info
+{
+    quant_scale_enum type;
+
+    // Writes the short name of `type` ("n", "pt", "bs", "kvbs" or "mx"); writes nothing for
+    // a value outside the enumerators.
+    void serialize(std::ostream& os) const
+    {
+        switch(type)
+        {
+        case quant_scale_enum::no_scale: os << "n"; break;
+        case quant_scale_enum::pertensor: os << "pt"; break;
+        case quant_scale_enum::blockscale: os << "bs"; break;
+        case quant_scale_enum::kv_blockscale: os << "kvbs"; break;
+        case quant_scale_enum::mx: os << "mx"; break;
+        }
+    }
+
+    // Parses either the short name or the numeric value of a scale mode.
+    // Throws std::invalid_argument for anything else.
+    static quant_scale_info decode(std::string str)
+    {
+        if(str == "n" || str == "0")
+            return quant_scale_info{quant_scale_enum::no_scale};
+        if(str == "pt" || str == "1")
+            return quant_scale_info{quant_scale_enum::pertensor};
+        if(str == "bs" || str == "2")
+            return quant_scale_info{quant_scale_enum::blockscale};
+        if(str == "kvbs" || str == "3")
+            return quant_scale_info{quant_scale_enum::kv_blockscale};
+        if(str == "mx" || str == "4")
+            return quant_scale_info{quant_scale_enum::mx};
+
+        throw std::invalid_argument("invalid quant scale value: " + str);
+    }
+
+    // Streams the serialize() form.
+    friend std::ostream& operator<<(std::ostream& os, const quant_scale_info& qsi)
+    {
+        qsi.serialize(os);
+        return os;
+    }
+};
+#pragma clang diagnostic pop
--- a/example/ck_tile/01_fmha/rotary.hpp
+++ b/example/ck_tile/01_fmha/rotary.hpp
@@ -0,0 +1,89 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+#include <cassert>
+#include <cmath>
+
+#ifndef M_PI // Not there on windows...
+#define M_PI 3.141592653589793238462643383279502884
+#endif
+
+#include <functional>
+#include <iterator>
+#include <optional>
+#include <random>
+#include <tuple>
+
+// keep sync with RotaryEmbeddingEnum
+// Rotary position embedding layout selector.
+enum class rope_enum
+{
+    none         = 0,
+    interleaved  = 1,
+    half_rotated = 2,
+};
+
+// Generates random rotary cos/sin tables of shape (seqlen * 2, rotary_dim / 2).
+// Angles are drawn uniformly from [0, 2*pi) with std::mt19937, seeded by `seed` when given
+// and by std::random_device otherwise. When rotary_dim <= 0 two 1x1 dummy tensors are
+// returned instead.
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+generate_rotary_cos_sin(ck_tile::index_t seqlen,
+                        ck_tile::index_t rotary_dim,
+                        std::optional<unsigned> seed = std::nullopt)
+{
+    // nothing to rotate: hand back placeholders
+    if(rotary_dim <= 0)
+    {
+        ck_tile::HostTensor<DataType> placeholder({1, 1});
+        return std::make_tuple(placeholder, placeholder);
+    }
+
+    using std::begin, std::end;
+
+    std::mt19937 rng(seed.has_value() ? *seed : std::random_device{}());
+    std::uniform_real_distribution<float> unit_dist(0.0f, 1.0f);
+
+    const ck_tile::index_t rows = seqlen * 2;
+    const ck_tile::index_t cols = rotary_dim / 2;
+
+    ck_tile::HostTensor<float> angles({rows, cols});
+    std::generate(begin(angles), end(angles), [&] { return unit_dist(rng) * 2 * M_PI; });
+
+    // applies `fn` to every angle and converts the result to DataType
+    const auto map_angles = [&](auto fn) {
+        ck_tile::HostTensor<DataType> table({rows, cols});
+        std::transform(begin(angles), end(angles), begin(table), [&](float angle) {
+            return ck_tile::type_convert<DataType>(fn(angle));
+        });
+        return table;
+    };
+
+    ck_tile::HostTensor<DataType> cos_table =
+        map_angles([](float angle) { return std::cos(angle); });
+    ck_tile::HostTensor<DataType> sin_table =
+        map_angles([](float angle) { return std::sin(angle); });
+
+    return std::make_tuple(cos_table, sin_table);
+}
+
+// Copies rows [seqlen_offset, seqlen_offset + seqlen) of the 2-D cos/sin tables into two new
+// tensors of shape (seqlen, cos.get_length(1)). The asserts require both inputs to be 2-D,
+// equally shaped, and long enough for the requested row range.
+template <typename DataType>
+std::tuple<ck_tile::HostTensor<DataType>, ck_tile::HostTensor<DataType>>
+slice_rotary_cos_sin(const ck_tile::HostTensor<DataType>& cos,
+                     const ck_tile::HostTensor<DataType>& sin,
+                     ck_tile::index_t seqlen_offset,
+                     ck_tile::index_t seqlen)
+{
+    assert(cos.get_num_of_dimension() == 2 && sin.get_num_of_dimension() == 2);
+    assert(cos.get_length(0) == sin.get_length(0) && cos.get_length(1) == sin.get_length(1));
+
+    assert(static_cast<std::size_t>(seqlen_offset + seqlen) <= cos.get_length(0));
+
+    const ck_tile::index_t rows = seqlen;
+    const ck_tile::index_t cols = cos.get_length(1);
+
+    // copies the requested row window out of one table
+    const auto take_rows = [&](const ck_tile::HostTensor<DataType>& table) {
+        ck_tile::HostTensor<DataType> window({rows, cols});
+        window.ForEach(
+            [&](auto& self, auto idx) { self(idx) = table(idx[0] + seqlen_offset, idx[1]); });
+        return window;
+    };
+
+    return std::make_tuple(take_rows(cos), take_rows(sin));
+}
--- a/example/ck_tile/01_fmha/script/benchmark_bwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_bwd.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_bwd -type f | head -n 1)"
+VALID=0
+
+for prec in "fp16" "bf16" ; do
+for perm in 0 1 ; do
+for hdim in 32 64 128 ; do
+
+nhead=$((2048 / $hdim))     # follow fav2 setup
+$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512   -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=8  -h=$nhead -d=$hdim -s=2048  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=4  -h=$nhead -d=$hdim -s=4096  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=2  -h=$nhead -d=$hdim -s=8192  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+
+done
+done
+done
--- a/example/ck_tile/01_fmha/script/benchmark_fwd.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
+VALID=0
+
+for prec in "fp16" "bf16" ; do
+for perm in 0 1 ; do
+for hdim in 64 128 256 ; do
+
+nhead=$((2048 / $hdim))     # follow fav2 setup
+$EXE -prec=$prec -b=32 -h=$nhead -d=$hdim -s=512   -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=16 -h=$nhead -d=$hdim -s=1024  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=8  -h=$nhead -d=$hdim -s=2048  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=4  -h=$nhead -d=$hdim -s=4096  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=2  -h=$nhead -d=$hdim -s=8192  -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+$EXE -prec=$prec -b=1  -h=$nhead -d=$hdim -s=16384 -iperm=$perm -operm=$perm -kname=1 -v=$VALID ; sleep 3
+
+done
+done
+done
+
+#Padding Benchmarks: batch mode (baseline vs low/med/high pad)
+prec="fp16"
+base_batch_args="-prec=$prec -mode=0 -b=4 -h=16 -h_k=16 -d=128 -s=1024 -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
+
+# baseline (no pad)
+$EXE $base_batch_args
+
+# low pad (≈90–95% effective)
+$EXE $base_batch_args -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
+
+# medium pad (≈60–75% effective)
+$EXE $base_batch_args -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
+
+# high pad (≈30–40% effective)
+$EXE $base_batch_args -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
+
+# Padding Benchmarks: group mode (baseline vs low/med/high physical pad)
+seqlens_q="1024,768,512,256"
+seqlens_k="1024,768,512,256"
+base_group_args="-prec=$prec -mode=1 -b=4 -h=16 -h_k=16 -d=128 -s=$seqlens_q -s_k=$seqlens_k -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=$VALID"
+
+# baseline (no physical pad)
+$EXE $base_group_args
+
+# low physical pad
+$EXE $base_group_args -s_qpad=1152,896,576,320 -s_kpad=1152,896,576,320
+
+# medium physical pad
+$EXE $base_group_args -s_qpad=1536,1152,768,384 -s_kpad=1536,1152,768,384
+
+# high physical pad
+$EXE $base_group_args -s_qpad=2048,1536,1024,512 -s_kpad=2048,1536,1024,512
--- a/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
+++ b/example/ck_tile/01_fmha/script/benchmark_fwd_v3.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+
+# Benchmark driver for tile_example_fmha_fwd_v3.
+# The executable is located with `find` below the current directory, so
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_fwd_v3 -type f | head -n 1)"
+# Without this guard an empty $EXE would make every line below try to execute
+# its first option (e.g. "-prec=fp16") as a command.
+if [ -z "$EXE" ] ; then
+    echo "error: tile_example_fmha_fwd_v3 not found below $(pwd); build it first" >&2
+    exit 1
+fi
+# value passed to the executable's -v option on every run
+VALID=0
+
+for causal in 0 1 ; do
+for prec in "fp16" "bf16" ; do
+for hdim in 128 ; do
+for perm in 0 ; do
+
+# batch size halves while the sequence length doubles from one run to the next
+$EXE -prec=$prec -b=32 -h=16        -s=512   -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=16 -h=16        -s=1024  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=8  -h=16        -s=2048  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=4  -h=16        -s=4096  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=2  -h=16        -s=8192  -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+
+# single-batch runs with other head counts and longer sequences
+$EXE -prec=$prec -b=1  -h=64        -s=16384 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=16 -h_k=1 -s=65536 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+$EXE -prec=$prec -b=1  -h=40        -s=37200 -d=$hdim -causal=$causal -iperm=$perm -operm=$perm -v=$VALID
+
+done
+done
+done
+done
+
+# Padding benchmark comparisons for v3 (batch mode only)
+# ==== V3 Padding Benchmarks: batch mode (baseline vs low/med/high pad) ====
+prec="fp16"
+base_v3_args="-prec=$prec -b=4 -h=16 -d=128 -s=1024 -mask=0 -iperm=0 -operm=0 -v=$VALID"
+
+# baseline (no pad)
+$EXE $base_v3_args
+
+# low pad (≈90–95% effective)
+$EXE $base_v3_args -q_eff_lens=1024,960,992,896 -kv_eff_lens=1024,960,992,896
+
+# medium pad (≈60–75% effective)
+$EXE $base_v3_args -q_eff_lens=896,768,512,640 -kv_eff_lens=896,768,512,640
+
+# high pad (≈30–40% effective)
+$EXE $base_v3_args -q_eff_lens=512,384,256,320 -kv_eff_lens=512,384,256,320
--- a/example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh
+++ b/example/ck_tile/01_fmha/script/correct_test_fwd_sink.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# The executable is looked up below the current directory, so
+# TODO: run this script from CK root or build directory
+EXE="$(find . -name tile_example_fmha_fwd -type f | head -n 1)"
+KNAME=1
+
+export CK_WARMUP=0 CK_REPEAT=1
+
+# options appended to every invocation
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+
+# Command-line switches:
+#    -s: run splitkv tests
+#    -a: run appendkv tests (the flag is recorded; nothing in this script reads it)
+# Any other option is silently ignored.
+TEST_SPLITKV=0
+TEST_APPENDKV=0
+while getopts ":sa" opt; do
+    case "${opt}" in
+        s) TEST_SPLITKV=1 ;;
+        a) TEST_APPENDKV=1 ;;
+        *) ;;
+    esac
+done
+
+# Sweep the forward executable over mask settings with three fields
+# ("t:2,0,4", "b:1,0,2") combined with layout, shape, lse and dropout options.
+# Only fp16 is exercised despite the function name. With -s on the command
+# line the sweep additionally covers several num_splits, a paged KV block
+# size of 128 and cache_batch_idx=1.
+run_fp16_bf16_tests() {
+    local split_counts="1"
+    local page_sizes="0"
+    local cache_idx_modes="0"
+
+    if [ $TEST_SPLITKV -eq 1 ] ; then
+        split_counts="$split_counts 2 3"
+        page_sizes="$page_sizes 128"
+        cache_idx_modes="$cache_idx_modes 1"
+    fi
+
+    for prec in "fp16" ; do
+    for mode in 1 0 ; do
+    for perm in 0 1 ; do
+    for vlayout in "r" "c" ; do
+    for batch in 1 4 ; do
+    for head in 1 ; do
+    for h_k in 1 ; do
+    for q_seq in 128 512 ; do
+    for kv_seq in 128 1024 ; do
+    for hdim in 32 64 128 256 ; do
+    for lse in 0 1 ; do
+    for bias in "e" ; do
+    for p_drop in 0.0 0.2 ; do
+    for mask in "t:2,0,4" "b:1,0,2" ; do
+    for num_splits in $split_counts ; do
+    for page_block_size in $page_sizes ; do
+    for cache_batch_idx in $cache_idx_modes ; do
+
+        # -d stays at 16 while -d_v follows the swept hdim
+        $EXE -prec=$prec -mode=$mode -b=$batch -h=$head -h_k=$h_k -d=16 -d_v=$hdim \
+             -s=$q_seq -s_k=$kv_seq -bias=$bias -p_drop=$p_drop -lse=$lse \
+             -iperm=$perm -operm=$perm -vlayout=$vlayout \
+             -num_splits=$num_splits -page_block_size=$page_block_size \
+             -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS -mask=$mask
+
+    done ; done ; done ; done ; done ; done
+    done ; done ; done ; done ; done ; done
+    done ; done ; done ; done ; done
+}
+
+
+# trace every command while the sweep runs
+set -x
+
+run_fp16_bf16_tests
+
+set +x
--- a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx1201.txt
+++ b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx1201.txt
--- a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx90a.txt
+++ b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx90a.txt
--- a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx942.txt
+++ b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx942.txt
--- a/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx950.txt
+++ b/example/ck_tile/01_fmha/script/fmha_bwd_known_fails_gfx950.txt
--- a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx90a.txt
+++ b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx90a.txt
--- a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx942.txt
+++ b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx942.txt
--- a/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx950.txt
+++ b/example/ck_tile/01_fmha/script/fmha_fwd_known_fails_gfx950.txt
--- a/example/ck_tile/01_fmha/script/run_full_test.sh
+++ b/example/ck_tile/01_fmha/script/run_full_test.sh
@@ -0,0 +1,52 @@
+#!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+#
+# in order to run this script you'd first need to build the tile_example_fmha_fwd and tile_example_fmha_bwd executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
+# input arguments:
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture : e.g., gfx90a, or gfx942, etc.
+
+set -euo pipefail
+
+# All four positional arguments are required. Check the count up front so a
+# short command line produces a usage message instead of a bare
+# "unbound variable" error from `set -u`.
+if [ $# -lt 4 ] ; then
+    echo "usage: $0 <environment tag> <branch name> <host name> <gpu_arch>" >&2
+    exit 1
+fi
+
+# get the command line arguments; they are exported so the scripts started
+# below inherit them (the smoke tests read GPU_arch)
+export env_type=$1
+echo 'Environment type: ' "$env_type"
+export branch=$2
+echo 'Branch name: ' "$branch"
+export host_name=$3
+echo 'Host name: ' "$host_name"
+export GPU_arch=$4
+echo 'GPU_arch: ' "$GPU_arch"
+
+# Start a fresh log file and write a header describing the run.
+#   $1: path of the log file (any existing file is removed first)
+#   $2: environment tag
+#   $3: branch name
+#   $4: node (host) name
+# The remaining header lines are filtered from the output of rocminfo, hipcc
+# and /opt/rocm/bin/amdclang++.
+# NOTE: the script runs under `set -euo pipefail`, so a pipeline below whose
+# grep matches nothing aborts the whole run.
+function print_log_header(){
+    local log_file=$1
+    local env_tag=$2
+    local branch_name=$3
+    local node_name=$4
+
+    rm -f "$log_file"
+    echo 'On branch ' "$branch_name" > "$log_file"
+    echo 'Node name: ' "$node_name" >> "$log_file"
+    # get GPU_arch and number of compute units from rocminfo
+    echo -n "GPU_arch: " >> "$log_file"
+    rocminfo | grep "Name:" | grep "gfx" >> "$log_file"
+    rocminfo | grep "Compute Unit:" >> "$log_file"
+    hipcc --version | grep -e 'HIP version' >> "$log_file"
+    echo 'Environment type: ' "$env_tag" >> "$log_file"
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> "$log_file"
+}
+
+# Verification: run each smoke test in turn. Because of `set -e` the first
+# script that exits non-zero stops the whole run.
+for smoke_test in smoke_test_fwd.sh smoke_test_bwd.sh smoke_test_fwd_sink.sh ; do
+    time example/ck_tile/01_fmha/script/$smoke_test
+done
+
+# Performance: each benchmark gets its own log, started with a header and
+# then extended with the benchmark's combined stdout/stderr.
+export fmha_fwd_log="perf_fmha_fwd_${GPU_arch}.log"
+print_log_header $fmha_fwd_log $env_type $branch $host_name
+time example/ck_tile/01_fmha/script/benchmark_fwd.sh 2>&1 | tee -a $fmha_fwd_log
+
+export fmha_bwd_log="perf_fmha_bwd_${GPU_arch}.log"
+print_log_header $fmha_bwd_log $env_type $branch $host_name
+time example/ck_tile/01_fmha/script/benchmark_bwd.sh 2>&1 | tee -a $fmha_bwd_log
+
--- a/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_bwd.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# The executable is looked up below the current directory, so
+# TODO: run this script from CK root or build directory
+set -euo pipefail
+
+SCRIPT_DIR=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+EXE_NAME=tile_example_fmha_bwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
+KNAME=1
+
+# Take the GPU architecture from the environment when it is set and non-empty,
+# otherwise use the first gfx name reported by rocminfo.
+if [ -z "${GPU_arch:-}" ] ; then
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
+
+export CK_WARMUP=0 CK_REPEAT=1
+
+# Failing command lines of this run are collected in CURR_FAILS_FILE (recreated
+# empty here) and later compared with KNOWN_FAILS_FILE; both can be overridden
+# from the environment.
+CURR_FAILS_FILE=${CURR_FAILS_FILE:-"fmha_bwd_fails_$GPU_arch.txt"}
+rm -f $CURR_FAILS_FILE
+touch $CURR_FAILS_FILE
+KNOWN_FAILS_FILE=${KNOWN_FAILS_FILE:-"$SCRIPT_DIR/fmha_bwd_known_fails_$GPU_arch.txt"}
+
+# options appended to every invocation
+COMMON_ARGS='-v=1'
+
+# Run $EXE with the given arguments and, when it exits non-zero, append the
+# command line to $CURR_FAILS_FILE instead of aborting the script.
+#   $@: arguments forwarded to the executable
+# errexit and xtrace are switched off while the executable runs and both are
+# switched back on before returning (xtrace is enabled even if it was off
+# when the function was called).
+run_exe() {
+    set +ex
+    # quoted so every argument reaches the executable exactly as passed in
+    "$EXE" "$@"
+    local ret=$?
+    if [ $ret -ne 0 ] ; then
+        echo "$EXE_NAME $*" >> "$CURR_FAILS_FILE"
+    fi
+    set -ex
+}
+
+# Run the executable once per batch/head/seqlen/mask combination listed below,
+# appending the caller's arguments to each combination.
+#   $@: extra options shared by all combinations
+test_h_s_mask() {
+    local shape
+    for shape in \
+        "-b=1 -h=4 -h_k=2 -s=259" \
+        "-b=2 -h=2 -s=516 -s_k=253" \
+        "-b=1 -h=4 -h_k=1 -s=500 -s_k=251 -mask=1" \
+        "-b=1 -h=2 -s=900 -s_k=258 -mask=2" \
+        "-b=2 -h=1 -s=987 -s_k=219 -mask=t:128,30" \
+        "-b=2 -h=3 -h_k=1 -s=244 -s_k=499 -mask=b:4,35"
+    do
+        # $shape is deliberately unquoted: it expands into separate options
+        run_exe $shape $@
+    done
+}
+
+set -x
+# main tests: full sweep over precision, permutation, head dim, mode, bias and dropout
+for prec in "fp16" "bf16" ; do
+  for perm in 0 1 ; do
+    for hdim in 32 64 128 256 ; do
+      for mode in 0 1 ; do
+        for bias in "n" "a" ; do
+          for dbias in 0 ; do
+            for p_drop in 0.0 0.2 ; do
+              for deterministic in 0 ; do
+                test_h_s_mask -prec=$prec -d=$hdim -bias=$bias -dbias=$dbias -p_drop=$p_drop -iperm=$perm -operm=$perm -deterministic=$deterministic -v=1 -mode=$mode -kname=$KNAME $COMMON_ARGS
+              done
+            done
+          done
+        done
+      done
+    done
+  done
+done
+
+# additional cases: head dims outside the main sweep, group mode (-mode=1) only
+for hdim in 40 48 72 96 ; do
+  test_h_s_mask -prec=fp16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=0 -operm=0 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
+  test_h_s_mask -prec=bf16 -d=$hdim -bias=n -dbias=0 -p_drop=0   -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
+  test_h_s_mask -prec=bf16 -d=$hdim -bias=a -dbias=0 -p_drop=0.2 -iperm=1 -operm=1 -deterministic=0 -v=1 -mode=1 -kname=$KNAME $COMMON_ARGS
+done
+set +x
+
+# Classify the recorded failures: a line that also appears verbatim in the
+# known-fails file counts as known, anything else as new. Without a known-fails
+# file every recorded failure is new. The exit status is 1 when there is at
+# least one new failure and 0 otherwise.
+new_fails_count=0
+known_fails_count=0
+if [ ! -f $KNOWN_FAILS_FILE ] ; then
+    new_fails_count=$(wc -l < $CURR_FAILS_FILE)
+    echo "No known fails file, all fails ($new_fails_count) are new:"
+    cat $CURR_FAILS_FILE
+else
+    echo "Comparing current fails ($CURR_FAILS_FILE) against known fails ($KNOWN_FAILS_FILE):"
+    while IFS= read -r failed_cmd; do
+        # -F fixed string, -x whole-line match, -q status only
+        if ! grep -Fxq "$failed_cmd" $KNOWN_FAILS_FILE; then
+            echo "New fail: $failed_cmd"
+            new_fails_count=$((new_fails_count + 1))
+        else
+            echo "Known fail: $failed_cmd"
+            known_fails_count=$((known_fails_count + 1))
+        fi
+    done < $CURR_FAILS_FILE
+fi
+echo "New fails count: $new_fails_count; Known fails count: $known_fails_count"
+exit $((new_fails_count != 0))
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd.sh
@@ -0,0 +1,271 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# The executable is looked up below the current directory, so
+# TODO: run this script from CK root or build directory
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+EXE_NAME=tile_example_fmha_fwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
+KNAME=1
+# `set -u` is active, so a plain $GPU_arch would abort the script with
+# "unbound variable" whenever the caller did not export it. Expand with an
+# empty default so the rocminfo fallback below can run.
+GPU_arch=${GPU_arch:-""}
+if [ -z "$GPU_arch" ] ; then
+    # first gfx name reported by rocminfo
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
+
+export CK_WARMUP=0 CK_REPEAT=1
+
+# Failing command lines of this run are collected in CURR_FAILS_FILE (recreated
+# empty here) and later compared with KNOWN_FAILS_FILE; both can be overridden
+# from the environment.
+CURR_FAILS_FILE=${CURR_FAILS_FILE:-"fmha_fwd_fails_$GPU_arch.txt"}
+rm -f $CURR_FAILS_FILE
+touch $CURR_FAILS_FILE
+KNOWN_FAILS_FILE=${KNOWN_FAILS_FILE:-"$SCRIPT_DIR/fmha_fwd_known_fails_$GPU_arch.txt"}
+
+# options appended to every invocation
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+
+# Command-line switches:
+#    -s: run splitkv tests
+#    -a: run appendkv tests
+# Any other option is silently ignored.
+TEST_SPLITKV=0
+TEST_APPENDKV=0
+while getopts ":sa" opt; do
+    case "${opt}" in
+        s) TEST_SPLITKV=1 ;;
+        a) TEST_APPENDKV=1 ;;
+        *) ;;
+    esac
+done
+
+# Run $EXE with the given arguments and, when it exits non-zero, append the
+# command line to $CURR_FAILS_FILE instead of aborting the script.
+#   $@: arguments forwarded to the executable
+# errexit and xtrace are switched off while the executable runs and both are
+# switched back on before returning (xtrace is enabled even if it was off
+# when the function was called).
+run_exe() {
+    set +ex
+    # quoted so every argument reaches the executable exactly as passed in
+    "$EXE" "$@"
+    local ret=$?
+    if [ $ret -ne 0 ] ; then
+        echo "$EXE_NAME $*" >> "$CURR_FAILS_FILE"
+    fi
+    set -ex
+}
+
+# fp16/bf16 sweep over mode, permutation, head dim, lse, bias and dropout.
+# Each combination runs nine shape/mask cases through run_exe, so failures are
+# recorded instead of aborting. With -s on the command line the sweep also
+# covers several num_splits, a paged KV block size of 128 and cache_batch_idx=1.
+run_fp16_bf16_tests() {
+    local split_counts="1"
+    local page_sizes="0"
+    local cache_idx_modes="0"
+    local lead mid tail
+
+    if [ $TEST_SPLITKV -eq 1 ] ; then
+        split_counts="$split_counts 2 3"
+        page_sizes="$page_sizes 128"
+        cache_idx_modes="$cache_idx_modes 1"
+    fi
+
+    for prec in "fp16" "bf16" ; do
+    for mode in 1 0 ; do
+    for perm in 0 1 ; do
+    for hdim in 32 64 128 256 ; do
+    for lse in 0 1 ; do
+    for bias in "n" "e" "a" ; do
+    for p_drop in 0.0 0.2 ; do
+    for num_splits in $split_counts ; do
+    for page_block_size in $page_sizes ; do
+    for cache_batch_idx in $cache_idx_modes ; do
+
+        # Option groups shared by all nine cases. They are expanded unquoted on
+        # purpose so each option becomes its own argument, in the same order in
+        # which the fails file records them.
+        lead="-prec=$prec -mode=$mode"
+        mid="-bias=$bias -p_drop=$p_drop -lse=$lse -iperm=$perm -operm=$perm"
+        tail="-num_splits=$num_splits -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx -kname=$KNAME $COMMON_ARGS"
+
+        run_exe $lead -b=2 -h=2 -h_k=1 -d=16 -d_v=$hdim -s=55 -s_k=256 $mid $tail
+        run_exe $lead -b=1 -h=3 -d=$hdim -s=100 -s_k=51 $mid $tail
+        run_exe $lead -b=2 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 $mid -mask=1 $tail
+        run_exe $lead -b=1 -h=2 -h_k=1 -d=$hdim -s=1024 -s_k=256 $mid -mask=2 $tail
+        run_exe $lead -b=2 -h=1 -d=$hdim -d_v=24 -s=3 -s_k=99 $mid -mask=2 $tail
+        run_exe $lead -b=3 -h=2 -h_k=1 -d=$hdim -s=200 -s_k=520 $mid -mask=t:128,30 $tail
+        run_exe $lead -b=2 -h=1 -d=$hdim -s=99 -s_k=32 $mid -mask=b:4,35 $tail
+        run_exe $lead -b=1 -h=2 -h_k=1 -d=$hdim -s=33 -s_k=0 $mid -mask=2 $tail
+        run_exe $lead -b=1 -h=2 -h_k=1 -d=$hdim -s=1 -s_k=10 -s_kpad=32 $mid -mask=2 $tail
+
+    done ; done ; done ; done ; done
+    done ; done ; done ; done ; done
+}
+
+# fp8bf16 runs over permutation, batch size, head dim and -qscale value.
+# The executable is called directly (not through run_exe), so under `set -e`
+# a failing run stops the script.
+run_fp8bf16_tests() {
+    for perm in 0 1 ; do
+        for b in 1 2 ; do
+            for hdim in 64 128 256 ; do
+                for scale in 1 2 ; do
+                    $EXE -prec=fp8bf16 -init=3 -b=$b -h=1 -d=$hdim -s=128 \
+                         -iperm=$perm -operm=$perm -vlayout=r -qscale=$scale \
+                         -kname=$KNAME $COMMON_ARGS
+                done
+            done
+        done
+    done
+}
+
+# fp8fp32 runs over permutation and batch size at head dim 128 with -qscale=1.
+# The executable is called directly (not through run_exe), so under `set -e`
+# a failing run stops the script.
+run_fp8fp32_tests() {
+    for perm in 0 1 ; do
+        for b in 1 2 ; do
+            for hdim in 128 ; do
+                $EXE -prec=fp8fp32 -init=3 -b=$b -h=1 -d=$hdim -s=128 \
+                     -iperm=$perm -operm=$perm -vlayout=r -qscale=1 \
+                     -kname=$KNAME $COMMON_ARGS
+            done
+        done
+    done
+}
+
+# fp16 appendkv sweep: query lengths 63..65 against key lengths, appended key
+# lengths, head dims, rotary settings, paging and cache_batch_idx. Runs go
+# through run_exe, so failures are recorded instead of aborting.
+run_fp16_appendkv_tests() {
+    for s in 63 64 65 ; do
+    for s_k in 65 129 ; do
+    for s_knew in 0 64 $s_k ; do
+    for hdim in 32 64 128 256 ; do
+    for ri in 0 1 ; do
+    for rdim in 0 16 32 $hdim ; do
+    for page_block_size in 0 128 ; do
+    for cache_batch_idx in 0 1 ; do
+
+        run_exe -prec=fp16 -b=3 -h=3 -d=$hdim -s=$s -s_k=$s_k -s_knew=$s_knew \
+                -rotary_dim=$rdim -rotary_interleaved=$ri \
+                -page_block_size=$page_block_size -cache_batch_idx=$cache_batch_idx \
+                -iperm=1 -operm=1 -kname=1 $COMMON_ARGS
+
+    done ; done ; done ; done
+    done ; done ; done ; done
+}
+
+# Padding-only smoke tests for batch and group mode, three padding levels each.
+# The executable is called directly (not through run_exe), so under `set -e`
+# a failing run stops the script.
+run_padding_smoke_tests() {
+    local prec="fp16"
+    local lens
+
+    # Batch mode: padding via effective lengths (exclude PAD)
+    # Use lse=1 to select a non-trload kernel and avoid overly strict tolerance mismatches
+    local base_batch="-prec=$prec -mode=0 -b=4 -h=16 -h_k=16 -d=128 -s=1024 -bias=n -mask=0 -lse=1 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS"
+    # low (~90-95% effective), medium (~60-75%) and high (~30-40%) pad, in that order
+    for lens in 1024,960,992,896 896,768,512,640 512,384,256,320 ; do
+        $EXE $base_batch -q_eff_lens=$lens -kv_eff_lens=$lens
+    done
+
+    # Group mode: padding via physical stride along seqlen
+    local seqlens_q="1024,768,512,256"
+    local seqlens_k="1024,768,512,256"
+    local base_group="-prec=$prec -mode=1 -b=4 -h=16 -h_k=16 -d=128 -s=$seqlens_q -s_k=$seqlens_k -bias=n -mask=0 -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=$KNAME $COMMON_ARGS"
+    # low, medium and high physical pad, in that order
+    for lens in 1152,896,576,320 1536,1152,768,384 2048,1536,1024,512 ; do
+        $EXE $base_group -s_qpad=$lens -s_kpad=$lens
+    done
+}
+
+# Basic padding and boundary tests (reference: smoke_test_fwd_pad.sh).
+# The executable is called directly (not through run_exe), so under `set -e`
+# a failing run stops the script.
+run_padding_basic_boundary_tests() {
+    local prec
+    local perm
+    # Trailing options shared by the cases below; expanded unquoted on purpose
+    # so each option becomes its own argument.
+    local tail="-kname=$KNAME $COMMON_ARGS"
+    local unsplit="-num_splits=1 -page_block_size=0 -cache_batch_idx=0 $tail"
+
+    # Group mode: Q&K padded with per-batch different strides
+    for prec in fp16 bf16 ; do
+        for perm in 0 1 ; do
+            $EXE -prec=$prec -mode=1 -b=2 -h=2 -h_k=1 -d=16 -d_v=32 -s=55 -s_k=256 \
+                 -s_qpad=64,60 -s_kpad=272,260 -bias=n -p_drop=0.0 -lse=0 \
+                 -iperm=$perm -operm=$perm $unsplit
+        done
+    done
+
+    # slightly larger, uneven padding strides
+    for prec in fp16 bf16 ; do
+        for perm in 0 1 ; do
+            $EXE -prec=$prec -mode=1 -b=3 -h=2 -h_k=1 -d=64 -d_v=64 -s=50,60,40 -s_k=128,256,192 \
+                 -s_qpad=64,64,64 -s_kpad=160,288,224 -bias=n -p_drop=0.0 -lse=1 \
+                 -iperm=$perm -operm=$perm $unsplit
+        done
+    done
+
+    # only K padded; Q unpadded
+    for prec in fp16 bf16 ; do
+        for perm in 0 1 ; do
+            $EXE -prec=$prec -mode=1 -b=2 -h=2 -h_k=1 -d=32 -d_v=64 -s=55 -s_k=256 \
+                 -s_kpad=272,260 -bias=n -p_drop=0.0 -lse=1 \
+                 -iperm=$perm -operm=$perm $unsplit
+        done
+    done
+
+    # use cu_seqlen overrides to skip tail PAD
+    for prec in fp16 bf16 ; do
+        for perm in 0 1 ; do
+            $EXE -prec=$prec -mode=0 -b=4 -h=8 -h_k=8 -d=128 -s=3 -s_k=3 \
+                 -q_eff_lens=1,2,1,2 -kv_eff_lens=1,2,1,2 -bias=n -p_drop=0.0 -lse=1 \
+                 -iperm=$perm -operm=$perm $unsplit
+
+            $EXE -prec=$prec -mode=0 -b=2 -h=2 -h_k=1 -d=32 -d_v=64 -s=64 -s_k=256 \
+                 -q_eff_lens=55,60 -kv_eff_lens=200,256 -bias=n -p_drop=0.0 -lse=0 \
+                 -iperm=$perm -operm=$perm $unsplit
+        done
+    done
+
+    # no padding (equal), mixed Q/KV, all len=1
+    for prec in fp16 bf16 ; do
+        $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+             -q_eff_lens=128,128,128,128 -kv_eff_lens=128,128,128,128 \
+             -bias=n -p_drop=0.0 -lse=1 $tail
+
+        $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+             -q_eff_lens=10,20,30,40 -kv_eff_lens=40,30,20,10 \
+             -bias=n -p_drop=0.0 -lse=1 $tail
+
+        $EXE -prec=$prec -mode=0 -b=4 -h=8 -d=64 -s=128 -s_k=128 \
+             -q_eff_lens=1,1,1,1 -kv_eff_lens=1,1,1,1 \
+             -bias=n -p_drop=0.0 -lse=1 $tail
+    done
+
+    # highly variable logical lengths
+    for prec in fp16 bf16 ; do
+        $EXE -prec=$prec -mode=1 -b=4 -h=4 -d=32 -s=1,127,3,65 -s_k=1,127,3,65 -s_kpad=128 \
+             -bias=n -p_drop=0.0 -lse=1 $tail
+    done
+
+    # GQA + Alibi + Causal mask (keep vlayout row-major for fp16/bf16)
+    for prec in fp16 bf16 ; do
+        $EXE -prec=$prec -mode=1 -b=2 -h=16 -h_k=4 -d=128 -s=256,129 -s_k=256,129 -s_kpad=256 \
+             -bias=a -mask=t -lse=1 -iperm=0 -operm=0 -vlayout=r $tail
+    done
+}
+
+# trace every command while the test groups run
+set -x
+
+# groups that always run, in this order
+for test_group in \
+    run_fp16_bf16_tests \
+    run_padding_smoke_tests \
+    run_padding_basic_boundary_tests \
+    run_fp8bf16_tests \
+    run_fp8fp32_tests
+do
+    $test_group
+done
+
+# appendkv tests only run when -a was given
+if [ $TEST_APPENDKV -eq 1 ] ; then
+    run_fp16_appendkv_tests
+fi
+
+set +x
+
+# Classify the recorded failures: a line that also appears verbatim in the
+# known-fails file counts as known, anything else as new. Without a known-fails
+# file every recorded failure is new. The exit status is 1 when there is at
+# least one new failure and 0 otherwise.
+new_fails_count=0
+known_fails_count=0
+if [ ! -f $KNOWN_FAILS_FILE ] ; then
+    new_fails_count=$(wc -l < $CURR_FAILS_FILE)
+    echo "No known fails file, all fails ($new_fails_count) are new:"
+    cat $CURR_FAILS_FILE
+else
+    echo "Comparing current fails ($CURR_FAILS_FILE) against known fails ($KNOWN_FAILS_FILE):"
+    while IFS= read -r failed_cmd; do
+        # -F fixed string, -x whole-line match, -q status only
+        if ! grep -Fxq "$failed_cmd" $KNOWN_FAILS_FILE; then
+            echo "New fail: $failed_cmd"
+            new_fails_count=$((new_fails_count + 1))
+        else
+            echo "Known fail: $failed_cmd"
+            known_fails_count=$((known_fails_count + 1))
+        fi
+    done < $CURR_FAILS_FILE
+fi
+echo "New fails count: $new_fails_count; Known fails count: $known_fails_count"
+exit $((new_fails_count != 0))
--- a/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh
+++ b/example/ck_tile/01_fmha/script/smoke_test_fwd_sink.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# The executable is looked up below the current directory, so
+# TODO: run this script from CK root or build directory
+#EXE="/code/composable_kernel/build/bin/tile_example_fmha_fwd"
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+EXE_NAME=tile_example_fmha_fwd
+EXE="$(find . -name $EXE_NAME -type f | head -n 1)"
+KNAME=1
+# `set -u` is active, so a plain $GPU_arch would abort the script with
+# "unbound variable" whenever the caller did not export it. Expand with an
+# empty default so the rocminfo fallback below can run.
+GPU_arch=${GPU_arch:-""}
+if [ -z "$GPU_arch" ] ; then
+    # first gfx name reported by rocminfo
+    GPU_arch=$(rocminfo | grep -E 'Name:\s+gfx' | head -n1 | awk '{print $2}')
+fi
+# trace every command from here on
+set -x
+
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+
+
+# Sink smoke runs. Each command below is followed by a diagram: the left
+# picture shows the mask named in its caption, the right one the same mask
+# with the first two columns additionally enabled (s=2).
+# NOTE(review): the third field of -mask appears to be the sink size (cf.
+# "sink_size = 2" below) — confirm against the executable's option help.
+# The script runs under `set -e`, so the first failing command stops it.
+
+# batch mode, seqlen 512, mask t:2,0,2
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:2,0,2
+
+# window_size[2,0], sink_size = 2
+
+#    x=1/y=3
+#    1 * * * * * * *           1 * * * * * * *
+#    1 1 * * * * * *           1 1 * * * * * *
+#    1 1 1 * * * * *   ---->   1 1 1 * * * * *
+#    * 1 1 1 * * * *           1 1 1 1 * * * *
+#    * * 1 1 1 * * *           1 1 1 1 1 * * *
+#    * * * 1 1 1 * *           1 1 * 1 1 1 * *
+#    * * * * 1 1 1 *           1 1 * * 1 1 1 *
+#    * * * * * 1 1 1           1 1 * * * 1 1 1
+#    l=2/r=0(tl)               l=2/r=0/s=2(tl)
+
+# batch mode, seqlen 1024, mask t:0,3,2
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=t:0,3,2 #-mask=b:3,0,2
+
+#    x=4/y=1
+#    1 1 1 1 * * * *           1 1 1 1 * * * *
+#    * 1 1 1 1 * * *           1 1 1 1 1 * * *
+#    * * 1 1 1 1 * *   ---->   1 1 1 1 1 1 * *
+#    * * * 1 1 1 1 *           1 1 * 1 1 1 1 *
+#    * * * * 1 1 1 1           1 1 * * 1 1 1 1
+#    l=0/r=3(tl)               l=0/r=3/s=2(tl)
+#    l=3/r=0(br)               l=3/r=0/s=2(br)
+
+
+# batch mode, seqlen 4096, mask b:1,0,2
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:1,0,2
+
+#    x=4/y=-1
+#    * * 1 1 * * * *           1 1 1 1 * * * *
+#    * * * 1 1 * * *           1 1 * 1 1 * * *
+#    * * * * 1 1 * *   ---->   1 1 * * 1 1 * *
+#    * * * * * 1 1 *           1 1 * * * 1 1 *
+#    * * * * * * 1 1           1 1 * * * * 1 1
+#    l=1/r=0(br)               l=1/r=0/s=2(br)
+
+
+# group mode, seqlen 8192, mask b:2,0,2
+$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:2,0,2
+
+#    x=-1/y=5
+
+#    * * * * * *               * * * * * *
+#    * * * * * *               * * * * * *
+#    1 * * * * *               1 * * * * *
+#    1 1 * * * *               1 1 * * * *
+#    1 1 1 * * *       ---->   1 1 1 * * *
+#    * 1 1 1 * *               1 1 1 1 * *
+#    * * 1 1 1 *               1 1 1 1 1 *
+#    * * * 1 1 1               1 1 * 1 1 1
+#    l=2/r=0(br)               l=2/r=0/s=2(br)
+
+
+# group mode, seqlen 16384, mask b:-1,1,2
+# NOTE(review): the caption of the diagram below reads l=2/r=0 while this
+# command passes b:-1,1,2 — confirm which of the two is intended.
+$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=16384 -s_k=16384 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -num_splits=1 -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -mask=b:-1,1,2
+#      x=-1/y=8
+#    * * * * *               * * * * *
+#    * * * * *               * * * * *
+#    1 * * * *      ---->    1 * * * *
+#    1 1 * * *               1 1 * * *
+#    1 1 1 * *               1 1 1 * *
+#    1 1 1 1 *               1 1 1 1 *
+#    1 1 1 1 1               1 1 1 1 1
+#    1 1 1 1 1               1 1 1 1 1
+#    l=2/r=0(br)             l=2/r=0/s=2(br)
+
+# The remaining runs pass -init_sink=1 with a plain -mask value or none at all;
+# the last two also use a paged KV cache (-page_block_size=128).
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=512 -s_k=512 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1
+
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=1024 -s_k=1024 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=0
+
+$EXE -prec=fp16 -mode=0 -b=1 -h=1 -d=128 -d_v=128 -s=4096 -s_k=4096 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1
+
+$EXE -prec=fp16 -mode=1 -b=1 -h=1 -d=128 -d_v=128 -s=8192 -s_k=8192 -bias=n -lse=0 -iperm=0 -operm=0 -vlayout=r -page_block_size=128 -cache_batch_idx=0  -kname=1 -v=1 -warmup=0 -repeat=1 -init_sink=1 -mask=1
--- a/example/ck_tile/01_fmha/utils.hpp
+++ b/example/ck_tile/01_fmha/utils.hpp
@@ -0,0 +1,254 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <optional>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "ck_tile/core/container/span.hpp"
+
+// Execution mode of the example; printed as "batch" or "group".
+enum class mode_enum
+{
+    batch = 0,
+    group
+};
+
+// Write the textual name of `mode` to `stream` and return the stream.
+// Declared inline because this header defines the function: without it,
+// including the header from more than one translation unit of the same
+// program yields duplicate definitions at link time.
+inline std::ostream& operator<<(std::ostream& stream, mode_enum mode)
+{
+    return stream << (mode == mode_enum::batch ? "batch" : "group");
+}
+
+// Print a vector as "[a, b, c]"; an empty vector prints as "[]".
+// Elements are written with their own operator<<. Returns `os`.
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    os << "[";
+    for(auto it = v.begin(); it != v.end(); ++it)
+    {
+        // separator goes before every element except the first
+        if(it != v.begin())
+        {
+            os << ", ";
+        }
+        os << *it;
+    }
+    os << "]";
+    return os;
+}
+
+// Convert per-sequence lengths into start offsets (an exclusive prefix sum
+// with the total appended): the result has seqlens.size() + 1 entries, begins
+// with 0 and ends with the sum of all lengths.
+// Declared inline because this header defines the function: without it,
+// including the header from more than one translation unit of the same
+// program yields duplicate definitions at link time.
+inline std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
+{
+    std::vector<int32_t> seqstarts;
+    // final size is known up front, so allocate once
+    seqstarts.reserve(seqlens.size() + 1);
+    seqstarts.push_back(0);
+    for(int32_t seqlen : seqlens)
+    {
+        seqstarts.push_back(seqstarts.back() + seqlen);
+    }
+    assert(seqstarts.size() == seqlens.size() + 1);
+    return seqstarts;
+}
+
+// Build `count` sequence lengths around `seqlen_avg`.
+//
+// Every entry starts as seqlen_avg clamped into [seqlen_min, seqlen_max]. In
+// group mode with more than one entry, single units are then repeatedly moved
+// from one randomly chosen entry to a different one, so the sum of all lengths
+// stays the same while the individual lengths vary. In batch mode (or with a
+// single entry) the uniform lengths are returned unchanged.
+//
+// `random_engine` is drawn from only in group mode with count > 1; the result
+// depends on the exact order of those draws.
+template <typename RandomEngine>
+std::vector<int32_t> generate_seqlens(mode_enum mode,
+                                      unsigned count,
+                                      int32_t seqlen_avg,
+                                      int32_t seqlen_min, // values <= 0 mean unset: lower bound is 1
+                                      int32_t seqlen_max, // values <= 0 mean unset: no upper bound
+                                      RandomEngine& random_engine)
+{
+    assert(0 < count);
+
+    // replace "unset" bounds by the widest range that keeps lengths positive
+    seqlen_min = (0 < seqlen_min ? seqlen_min : 1);
+    seqlen_max = (0 < seqlen_max ? seqlen_max : std::numeric_limits<int32_t>::max());
+    assert(seqlen_min <= seqlen_max);
+
+    std::vector<int32_t> seqlens(count, std::clamp(seqlen_avg, seqlen_min, seqlen_max));
+
+    if(mode == mode_enum::group && 1 < count)
+    {
+        using size_type = std::vector<int32_t>::size_type;
+
+        // index of the entry that gives up one unit
+        std::uniform_int_distribution<size_type> idx_dist(0, count - 1);
+        auto next_idx = std::bind(idx_dist, std::ref(random_engine));
+
+        // offset in [1, count - 1], so the receiving entry always differs from the giving one
+        std::uniform_int_distribution<size_type> step_dist(1, count - 1);
+        auto next_step = std::bind(step_dist, std::ref(random_engine));
+
+        // number of attempted moves; attempts that would leave the allowed range are skipped
+        for(unsigned repeat = seqlen_avg * (count / 2); 0 < repeat; --repeat)
+        {
+            const size_type to_decrease = next_idx();
+            // make sure each element of seqlens stays in range [seqlen_min, seqlen_max]
+            if(seqlens[to_decrease] == seqlen_min)
+            {
+                continue;
+            }
+
+            const size_type to_increase = (to_decrease + next_step()) % count;
+
+            if(seqlens[to_increase] >= seqlen_max)
+            {
+                continue;
+            }
+
+            --seqlens[to_decrease];
+            ++seqlens[to_increase];
+        }
+    }
+
+    return seqlens;
+}
+
+// Draw one integer uniformly from the closed range [low, high] using
+// `random_engine`. Only participates in overload resolution for integral `Int`.
+template <typename Int = int, typename RandomEngine>
+auto randint(Int low,
+             Int high,
+             RandomEngine& random_engine) -> std::enable_if_t<std::is_integral_v<Int>, Int>
+{
+    using distribution_type = std::uniform_int_distribution<Int>;
+    return distribution_type{low, high}(random_engine);
+}
+
+// Fill [first, last) with integers drawn uniformly from the closed range
+// [low, high], one draw per element in iteration order. Only participates in
+// overload resolution for integral `Int`.
+template <typename Int, typename ForwardIterator, typename RandomEngine>
+auto randints(ForwardIterator first,
+              ForwardIterator last,
+              Int low,
+              Int high,
+              RandomEngine& random_engine) -> std::enable_if_t<std::is_integral_v<Int>>
+{
+    std::uniform_int_distribution<Int> dist(low, high);
+
+    for(; first != last; ++first)
+    {
+        *first = dist(random_engine);
+    }
+}
+
+/*
+ * generate missing values in *_val randomly when the number of values is smaller than batch
+ * example (assume batch=3)
+ *   q_val=1,2,3 k_val=4,5,6 -> OK
+ *   q_val=1,2,3             -> OK, k same as q
+ *   q_val=1,2               -> OK, q will rand remaining 1 element, k same as q
+ *   q_val=1,2   k_val=4,5   -> OK, q/k will rand remaining 1 element
+ *   q_val=1,2,3,4           -> OK, but the excess value is ignored
+ *
+ *   q_val=1,2   k_val=4,5,6 -> not OK, k must have same splits with q
+ *   q_val=1,2   k_val=4     -> not OK, k must have same splits with q
+ */
+template <typename RandomEngine>
+std::tuple<std::vector<ck_tile::index_t>,
+           std::vector<ck_tile::index_t>,
+           std::vector<ck_tile::index_t>,
+           std::vector<ck_tile::index_t>>
+generate_missing_seqlens(mode_enum mode,
+                         ck_tile::index_t batch,
+                         const std::vector<ck_tile::index_t>& q_val,
+                         const std::vector<ck_tile::index_t>& k_val,
+                         const std::vector<ck_tile::index_t>& q_pad_val,
+                         const std::vector<ck_tile::index_t>& k_pad_val,
+                         ck_tile::index_t seqlen_k_min,
+                         bool need_append_kvcache,
+                         RandomEngine& random_engine)
+{
+    if(mode == mode_enum::batch)
+    {
+        ck_tile::index_t q = q_val[0];
+        ck_tile::index_t k = k_val[0];
+
+        auto s_q = std::vector<ck_tile::index_t>(batch, q);
+        auto s_k = [&] {
+            const ck_tile::index_t seqlen_k_max = (k < 0 ? q : k);
+            std::vector<ck_tile::index_t> seqlen_ks(batch, seqlen_k_max);
+
+            if(1 < batch && need_append_kvcache)
+            {
+                // to keep the original s_k value, we always use seqlen_k_max in first batch
+                randints(std::next(seqlen_ks.begin()),
+                         seqlen_ks.end(),
+                         seqlen_k_min,
+                         seqlen_k_max,
+                         random_engine);
+                return seqlen_ks;
+            }
+
+            return seqlen_ks;
+        }();
+        auto s_kpad = std::vector<ck_tile::index_t>(batch, -1); // TODO: batch not support k_padding
+        auto s_qpad = std::vector<ck_tile::index_t>(batch, -1);
+        // s_k should be greater than or equal to seqlen_k_min if provided
+        if(s_k.back() < seqlen_k_min)
+        {
+            std::ostringstream msg;
+            msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
+                << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
+            throw std::runtime_error(msg.str());
+        }
+
+        return std::make_tuple(s_q, s_k, s_qpad, s_kpad);
+    }
+    else
+    {
+        std::vector<ck_tile::index_t> s_q;
+        std::vector<ck_tile::index_t> s_k;
+        std::vector<ck_tile::index_t> s_kpad;
+        std::vector<ck_tile::index_t> s_qpad;
+        ck_tile::index_t idx = 0;
+        for(; idx < std::min(static_cast<ck_tile::index_t>(q_val.size()), batch); ++idx)
+        {
+            ck_tile::index_t q = q_val[idx];
+            ck_tile::index_t k =
+                k_val[std::min(idx, static_cast<ck_tile::index_t>(k_val.size()) - 1)];
+            ck_tile::index_t kp =
+                k_pad_val.empty()
+                    ? -1
+                    : k_pad_val[std::min(idx, static_cast<ck_tile::index_t>(k_pad_val.size()) - 1)];
+
+            ck_tile::index_t qp =
+                q_pad_val.empty()
+                    ? -1
+                    : q_pad_val[std::min(idx, static_cast<ck_tile::index_t>(q_pad_val.size()) - 1)];
+
+            s_q.push_back(q);
+            s_k.push_back(k < 0 ? q : k);
+            s_kpad.push_back(kp);
+            s_qpad.push_back(qp);
+
+            // s_k should be greater than or equal to seqlen_k_min
+            if(s_k.back() < seqlen_k_min)
+            {
+                std::ostringstream msg;
+                msg << __FILE__ << ":" << __LINE__ << ": seqlen_k (=" << s_k.back()
+                    << ") is less than minimum seqlen_k (=" << seqlen_k_min << ")";
+                throw std::runtime_error(msg.str());
+            }
+        }
+        if(idx < batch)
+        {
+            auto rem_q =
+                generate_seqlens(mode, batch - idx, s_q.back(), 1, s_q.back(), random_engine);
+            auto rem_k = generate_seqlens(
+                mode, batch - idx, s_k.back(), seqlen_k_min, s_kpad.back(), random_engine);
+
+            s_q.insert(s_q.end(), rem_q.begin(), rem_q.end());
+            s_k.insert(s_k.end(), rem_k.begin(), rem_k.end());
+            s_kpad.insert(s_kpad.end(), batch - idx, s_kpad.back());
+            s_qpad.insert(s_qpad.end(), batch - idx, s_qpad.back());
+        }
+        return std::make_tuple(s_q, s_k, s_qpad, s_kpad);
+    }
+}
+
+// Write value, value + 1, value + 2, ... into [first, last) and then permute the range in
+// place, using random_engine as the source of randomness.
+// The overload only exists for integral Int.
+template <typename RandomAccessIterator, typename Int, typename RandomEngine>
+auto iota_shuffle(RandomAccessIterator first,
+                  RandomAccessIterator last,
+                  Int value,
+                  RandomEngine& random_engine) -> std::enable_if_t<std::is_integral_v<Int>>
+{
+    // ascending fill
+    for(auto cursor = first; cursor != last; ++cursor)
+    {
+        *cursor = value;
+        ++value;
+    }
+
+    std::shuffle(first, last, random_engine);
+}
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
@@ -0,0 +1,47 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# APIs selectable through LAYERNORM2D_FWD_ENABLE_APIS; the value "all" expands to this list
+set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
+set(LAYERNORM2D_FWD_ENABLE_APIS "fwd" CACHE STRING
+    "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
+if("${LAYERNORM2D_FWD_ENABLE_APIS}" STREQUAL "all")
+  set(LAYERNORM2D_FWD_ENABLE_APIS ${LAYERNORM2D_FWD_KNOWN_APIS})
+endif()
+
+# generate a list of kernels, but not actually emit files at config stage
+# NOTE(review): an unquoted list expands to one argument per API after --api;
+# confirm generate.py accepts that form when more than one API is enabled
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
+  RESULT_VARIABLE ret
+)
+if(ret AND NOT ret EQUAL 0)
+  message(FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+endif()
+
+# the list step above writes the names of the files to be generated into this text file
+file(STRINGS "${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt" LAYERNORM2D_FWD_GEN_BLOBS)
+
+# emit the kernel sources at build time; re-run whenever the generator script changes
+add_custom_command(
+  OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
+  DEPENDS ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  VERBATIM
+)
+
+set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")
+
+message(DEBUG "adding example ${EXAMPLE_LAYERNORM2D_FWD}")
+add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp)
+target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
+
+set(EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS)
+
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+list(APPEND EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal --offload-compress)
+
+target_compile_options(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${EXAMPLE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/02_layernorm2d/README.md
+++ b/example/ck_tile/02_layernorm2d/README.md
@@ -0,0 +1,134 @@
+# LayerNorm2D Forward with CK Tile
+
+This example demonstrates efficient 2D layer normalization using the CK Tile programming model, leveraging tile-based parallelism and advanced fusion for transformer and LLM workloads.
+
+---
+
+## Algorithm and Math
+
+LayerNorm computes, for each row $x$:
+$$
+\mu = \frac{1}{N} \sum_{i=1}^N x_i,\quad \sigma^2 = \frac{1}{N} \sum_{i=1}^N (x_i - \mu)^2
+$$
+$$
+\hat{x}_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}},\quad y_i = \gamma \hat{x}_i + \beta
+$$
+
+- **Welford's Algorithm**: Used for numerically stable, blockwise mean/variance computation. For $N \leq 4096$, a one-pass algorithm is used; for large $N$, a two-pass approach is adopted.
+
+---
+
+## Features
+
+- **Prenorm/Postnorm Fusion**: Fused residual addition before/after normalization for transformer blocks.
+- **Smooth/Dynamic Quantization**: Rowwise int8 quantization with per-token scale, supporting smoothquant for LLMs.
+- **Flexible Precision**: Supports fp16, bf16, int8 output.
+- **Efficient for Large N**: Two-pass pipeline for $N > 4096$.
+- **Highly Modular**: Easily extendable for new fusion or quantization strategies.
+
+---
+
+## Build & Run
+
+```
+# in the root of ck_tile
+mkdir build && cd build
+../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with your target, e.g. gfx90a, gfx942...
+make tile_example_layernorm2d_fwd -j
+```
+This will result in an executable `build/bin/tile_example_layernorm2d_fwd`
+
+## Example
+```
+args:
+          -m    m dimension (default:3328)
+          -n    n dimension (default:4096)
+     -stride    stride per row, if -1 then equal to n (default:-1)
+          -e    epsilon (default:1e-5)
+    -save_mv    save mean/variance(invstd) or not. set to 1 in training case (default:0)
+          -v    cpu validation or not (default:1)
+      -kname    print kernel name or not (default:1)
+     -prec_i    input precision (default:fp16)
+     -prec_o    output precision, set auto will be the same as input (default:auto)
+    -prec_sm    output quant scale type, set auto will be the same as input. used when fquant=1 (default:auto)
+    -prec_sy    output quant scale type, set auto will be the same as input. used when fquant=1 or 2 (default:auto)
+       -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
+     -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+     -warmup    cold iter (default:5)
+     -repeat    hot iter (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:layernorm2d_fwd.json)
+
+```
+---
+
+## Technical Details
+
+## Welford online algorithm
+We use the Welford algorithm to update `mean`/`variance` block by block. For the `N <= 4096` case we can compute `mean`/`var`/`normalization` within one loop; we call this `one-pass`. For the large-N case, it is hard to keep `mean`/`var` inside registers/LDS and then compute the `normalization`, so we need to load the input twice: the first time to compute `mean`/`var` block by block, and a second time to compute the `normalization`. We call this `two-pass`.
+
+## mean/variance save
+In the training case the mean/variance needs to be stored out (TBD, not supported yet).
+
+## prenorm/postnorm
+
+![](misc/pnorm.png)
+
+Since [prenorm/postnorm](https://arxiv.org/pdf/1906.01787) is quite common in LLM blocks, this example boosts this feature by kernel fusion. Note that `prenorm`/`postnorm` always need to do elementwise-add a `shortcut` before the actual layernorm computation, and optionally store out the result to global. You can use `-fadd=1` to test `pre-add+store`, or `-fadd=2` to test `pre-add` without store out (not codegen by default).
+
+## smooth-quant/dynamic-quant
+We support smooth/dynamic quantization for `int8` output, by setting `-fquant=1` and `-prec_o=int8`. In this case the output goes through a rowwise dynamic quantization as shown below. Note that smooth-quant requires a `(1*N)` per-channel scale as input (in fp32 in our example, though this is customizable), which is element-wise multiplied with each row of the tensor before the rowwise dynamic quant is computed. Setting `-fquant=2` skips the per-channel scale stage and performs only the dynamic quant. This case is supported in our kernel but is not generated by default (TBD: add some filter in generate.py to support on-demand codegen).
+![](misc/dquant.png)
+
+```
+# assume output int8, hidden_states is [m, n] shape and in fp16/bf16
+# [m, 1]
+per_token_amax, _ = torch.max(
+     input=torch.abs(hidden_states), 
+     dim=-1, 
+     keepdim=True
+)
+per_token_scale = per_token_amax.to(dtype=torch.float32) / 127.0
+
+# quant hidden_states
+hidden_states = (hidden_states / per_token_scale).to(dtype=torch.int8)
+
+return hidden_states, per_token_scale
+# hidden_states is now int8 and will be fed to the next layer as input
+# per_token_scale will be used as the dequant factor in a later layer
+```
+## limitations
+Note that `fquant=2`, `fadd=2`, and `prec_sm/prec_sy` other than `fp32` are not generated by default, though our kernel template supports them (TBD: add some flag in generate.py to generate those instances on demand). Besides, the `N>8192` case uses the two-pass pipeline by default, and `-fquant=1/2` are not supported there yet. If you need to support `N>8192` and `fused+residual+store`, you can use this example together with `12_smoothquant` to construct layernorm+residual and smoothquant as 2 kernels for this purpose.
+
+```
+# some case
+# standard fp16 layernorm 2d, m=10. n=1024
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024
+
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1
+
+# standard fp16 layernorm 2d, m=10. n=1024, fused-smooth-quant+fused-add-store, output in int8
+./build/bin/tile_example_layernorm2d_fwd  -m=10 -n=1024 -prec_o=int8 -fquant=1 -fadd=1
+```
+---
+
+## Source Structure
+
+- **Kernel**: `layernorm2d_fwd.hpp` (tile-programming kernel template)
+- **Executable**: `layernorm2d_fwd.cpp` (argument parsing, kernel launch)
+- **Codegen**: `generate.py` (instantiates kernels for different configs)
+- **Misc**: `misc/` (algorithm diagrams, e.g., prenorm/postnorm, quantization)
+
+---
+
+## Related CK Tile Examples
+
+- [01_fmha](../01_fmha/README.md): Fused multi-head attention (FMHA)
+- [03_gemm](../03_gemm/README.md): Tile-programming GEMM
+- [12_smoothquant](../12_smoothquant/README.md): Standalone smoothquant kernel
+
+For details on tiles and tile distribution, see `include/ck_tile/tile_program/tile_distribution/`.
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -0,0 +1,504 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "layernorm2d_fwd.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include <algorithm>
+#include <cstring>
+
+// Validation tolerances, returned as the pair (rtol, atol) and selected by data type.
+// Generic case: applies to every type that has no dedicated specialization below.
+template <typename DataType>
+auto get_elimit()
+{
+    return ck_tile::make_tuple(/*rtol=*/1e-2, /*atol=*/1e-2);
+}
+
+// bf16: currently the same tolerances as the generic case
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    return ck_tile::make_tuple(/*rtol=*/1e-2, /*atol=*/1e-2);
+}
+
+// int8: the absolute tolerance is a full unit (1.0)
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    return ck_tile::make_tuple(/*rtol=*/1e-2, /*atol=*/1.0);
+}
+
+// Register every command line option of the example (name, default value, help text) and
+// parse argv. Returns the tuple (parse succeeded, parser holding the parsed values).
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+
+    arg_parser
+        // problem size and row strides
+        .insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("x_stride", "-1", "x row_stride, if -1 then equal to n")
+        .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n")
+        .insert("y_stride", "-1", "y row_stride, if -1 then equal to n")
+        .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
+        // numerics, validation and reporting
+        .insert("e", "1e-5", "epsilon")
+        .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        // precisions
+        .insert("prec_i", "fp16", "input precision")
+        .insert("prec_o", "auto", "output precision, set auto will be the same as input")
+        .insert("prec_sm",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1")
+        .insert("prec_sy",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1 or 2")
+        // fusion switches
+        .insert("xbias", "0", "add bias, 0:no add, 1:add bias before fadd")
+        .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
+        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
+        // timing and result dump
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "layernorm2d_fwd.json", "json file name to dump results");
+
+    const bool parsed_ok = arg_parser.parse(argc, argv);
+    return std::make_tuple(parsed_ok, arg_parser);
+}
+
+// Run one layernorm2d forward problem on the device and optionally validate it on the host.
+//
+// Template parameters
+//   InDataType / OutDataType  element types of the input x and the output y
+//   SmoothScaleDataType       element type of the per-channel smooth scale (fquant=1)
+//   YScaleDataType            element type of the per-row output scale (fquant=1 or 2)
+//   SaveMeanVar               selects real mean/invstd host types instead of null_type
+//
+// Problem size, strides, fusion switches and timing settings are read from arg_parser.
+// Returns false if the option combination is rejected, if no kernel supports the problem,
+// or if validation fails; returns true otherwise.
+template <typename InDataType,
+          typename OutDataType,
+          typename SmoothScaleDataType,
+          typename YScaleDataType,
+          bool SaveMeanVar>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    // a negative stride means "densely packed", i.e. equal to n
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride");
+    if(xr_stride < 0)
+        xr_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
+    ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride");
+    if(yr_stride < 0)
+        yr_stride = n;
+    float epsilon       = arg_parser.get_float("e");
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sm == "auto")
+    {
+        prec_sm = "fp32";
+    }
+    if(prec_sy == "auto")
+    {
+        prec_sy = "fp32";
+    }
+
+    int kname         = arg_parser.get_int("kname");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
+    int xbias         = arg_parser.get_int("xbias");
+    int fused_add     = arg_parser.get_int("fadd");
+    int fused_quant   = arg_parser.get_int("fquant");
+    // NOTE(review): the message mentions "1 or 2" but only fused_quant == 1 is rejected
+    // here -- confirm which of the two is intended
+    if(fused_quant == 1 && prec_o != "int8" && prec_o != "fp8")
+    {
+        std::cout
+            << "if fused_quant is 1 or 2, only support \"-prec_o=int8\" or \"-prec_o=fp8\" cases."
+            << std::endl;
+        return false;
+    }
+
+    assert(x_stride >= n);
+
+    using TypeConfig =
+        LayerNormTypeConfig<InDataType, OutDataType, SmoothScaleDataType, YScaleDataType>;
+
+    using XDataType         = typename TypeConfig::XDataType;
+    using YDataType         = typename TypeConfig::YDataType;
+    using XBiasDataType     = typename TypeConfig::XBiasDataType;
+    using GammaDataType     = typename TypeConfig::GammaDataType;
+    using BetaDataType      = typename TypeConfig::BetaDataType;
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
+
+    using MeanDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::MeanDataType, ck_tile::null_type>;
+    using InvStdDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::InvStdDataType, ck_tile::null_type>;
+
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
+    ck_tile::HostTensor<XBiasDataType> x_bias_host({n});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+    ck_tile::HostTensor<BetaDataType> beta_host({n});
+
+    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {xr_stride, 1});
+    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {yr_stride, 1});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {y_stride, 1});
+
+    ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
+    ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_dev({m});
+
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host({n});
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host_dev({n});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
+    ck_tile::FillUniformDistribution<SmoothScaleDataType>{-1.f, 1.f}(sm_scale_host);
+    ck_tile::FillUniformDistribution<XBiasDataType>{-.5f, .5f}(x_bias_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+    ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_bias_buf(x_bias_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sm_scale_buf(sm_scale_host_dev.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    x_bias_buf.ToDevice(x_bias_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+    beta_buf.ToDevice(beta_host.data());
+    x_residual_buf.ToDevice(x_residual_host.data());
+    sm_scale_buf.ToDevice(sm_scale_host.data());
+
+    // precision tag for the log line: "<in>[|<out>][(<y scale>)]"
+    auto prec_str = [&]() {
+        auto base_str = prec_i;
+        if(prec_i != prec_o)
+        {
+            base_str += "|" + prec_o;
+        }
+        if(fused_quant == 1)
+        {
+            base_str += std::string("(") + prec_sy + ")";
+        }
+        return base_str;
+    }();
+
+    std::cout << "[" << prec_str << "]" << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
+              << ", yr_stride:" << yr_stride << std::flush;
+
+    layernorm2d_fwd_traits traits{
+        prec_i, prec_o, prec_sm, prec_sy, SaveMeanVar, xbias, fused_add, fused_quant};
+
+    // buffers that the selected fusion mode does not use are passed as nullptr
+    layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
+                              fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant == 1 ? sm_scale_buf.GetDeviceBuffer() : nullptr,
+                              x_bias_buf.GetDeviceBuffer(),
+                              gamma_buf.GetDeviceBuffer(),
+                              beta_buf.GetDeviceBuffer(),
+
+                              y_buf.GetDeviceBuffer(),
+                              fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr,
+                              nullptr, // p_mean, unsupported yet
+                              nullptr, // p_invStd, unsupported yet
+
+                              epsilon,
+                              m,
+                              n,
+                              x_stride,   // x row_stride
+                              xr_stride,  // x residual row stride
+                              y_stride,   // y row stride
+                              yr_stride}; // y residual row stride
+
+    float ave_time = layernorm2d_fwd(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    // a negative time is treated as "no kernel for this configuration"
+    if(ave_time < 0)
+    {
+        std::cout << " not supported!" << std::endl << std::flush;
+        return false;
+    }
+
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XBiasDataType) * n +
+                           sizeof(GammaDataType) * n + sizeof(BetaDataType) * n +
+                           sizeof(YDataType) * m * n;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        if(xbias != 0)
+        {
+            // add bias before fadd
+            int M = x_host.mDesc.get_lengths()[0];
+            int N = x_host.mDesc.get_lengths()[1];
+            for(int idx_m = 0; idx_m < M; ++idx_m)
+            {
+                for(int idx_n = 0; idx_n < N; ++idx_n)
+                {
+                    x_host(idx_m, idx_n) = ck_tile::type_convert<XDataType>(
+                        ck_tile::type_convert<ComputeDataType>(x_host(idx_m, idx_n)) +
+                        ck_tile::type_convert<ComputeDataType>(x_bias_host(idx_n)));
+                }
+            }
+        }
+
+        if(fused_add != 0)
+        {
+            // fused pre_add/pre_add_store
+            // TODO we accumulate directly to x_host for simplicity here...
+
+            std::transform(x_host.mData.cbegin(),
+                           x_host.mData.cend(),
+                           x_residual_host.mData.cbegin(),
+                           x_host.mData.begin(),
+                           [](auto x_, auto r_) {
+                               auto o_ = ck_tile::type_convert<ComputeDataType>(x_) +
+                                         ck_tile::type_convert<ComputeDataType>(r_);
+                               return ck_tile::type_convert<XDataType>(o_);
+                           });
+        }
+
+        // the reference is computed exactly once, by whichever branch below applies
+        if(fused_quant != 0)
+        {
+            // per-row epilogue for the reference: optional smooth scaling, then dynamic
+            // quantization of row m_ of acc_ into o_
+            auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
+                int N_ = acc_.mDesc.get_lengths()[1];
+                if(fused_quant == 1)
+                {
+                    for(int n_ = 0; n_ < N_; n_++)
+                    {
+                        // input smooth outlier
+                        acc_(m_, n_) = acc_(m_, n_) *
+                                       ck_tile::type_convert<ComputeDataType>(sm_scale_host(n_));
+                    }
+                }
+                ComputeDataType absmax = static_cast<ComputeDataType>(0);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    const auto a = ck_tile::abs(acc_(m_, n_));
+                    absmax       = a > absmax ? a : absmax;
+                }
+                // printf("cpu:absmax:%f\n", absmax);
+                // divisor used to derive the per-row scale from the row's absmax
+                constexpr ComputeDataType kMaxY =
+                    std::is_same<YDataType, ck_tile::fp8_t>::value    ? 240.0
+                    : std::is_same<YDataType, ck_tile::int8_t>::value ? 127.0
+                                                                      : 0.0;
+                ComputeDataType y_scale = absmax / kMaxY;
+                y_scale_host_ref(m_)    = ck_tile::type_convert<YScaleDataType>(y_scale);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
+                }
+            };
+
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(x_host,
+                                                               gamma_host,
+                                                               beta_host,
+                                                               y_host_ref,
+                                                               mean_host_ref,
+                                                               invStd_host_ref,
+                                                               epsilon,
+                                                               dquant_functor);
+        }
+        else
+        {
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(
+                x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
+        }
+
+        y_buf.FromDevice(y_host_dev.data());
+
+        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {yr_stride, 1});
+        if(fused_add == 1)
+        {
+            y_residual_buf.FromDevice(y_residual_host_dev.data());
+        }
+
+        auto [rtol, atol] = get_elimit<OutDataType>();
+
+        // A whole-tensor comparison is only valid when every tensor taking part in it is
+        // densely packed; otherwise compare row by row so that padding is never looked at.
+        const bool packed_compare =
+            (y_stride == n) && (fused_add != 1 || (x_stride == n && yr_stride == n));
+
+        if(packed_compare)
+        {
+            pass = ck_tile::check_err(
+                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+            if(fused_add == 1)
+            {
+                pass &= ck_tile::check_err(y_residual_host_dev,
+                                           x_host,
+                                           std::string("ADD Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+        }
+        else
+        {
+            for(int i_r = 0; i_r < m; i_r++)
+            {
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * y_stride,
+                                                      y_host_dev.begin() + i_r * y_stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * y_stride,
+                                                      y_host_ref.begin() + i_r * y_stride + n);
+                pass &= ck_tile::check_err(y_host_dev_row,
+                                           y_host_ref_row,
+                                           std::string("OUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+                if(fused_add == 1)
+                {
+                    std::vector<YResidualDataType> y_residual_host_dev_row(
+                        y_residual_host_dev.begin() + i_r * yr_stride,
+                        y_residual_host_dev.begin() + i_r * yr_stride + n);
+                    // the reference rows live in x_host, which is laid out with x_stride
+                    std::vector<YResidualDataType> y_residual_host_ref_row(
+                        x_host.begin() + i_r * x_stride, x_host.begin() + i_r * x_stride + n);
+                    pass &= ck_tile::check_err(y_residual_host_dev_row,
+                                               y_residual_host_ref_row,
+                                               std::string("ADD[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+        if(fused_quant == 1)
+        {
+            y_scale_buf.FromDevice(y_scale_host_dev.data());
+            pass &= ck_tile::check_err(y_scale_host_dev,
+                                       y_scale_host_ref,
+                                       std::string("SCALE Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_layernorm2d_fwd_json_results(arg_parser.get_str("jsonfile"),
+                                          prec_i,
+                                          prec_o,
+                                          prec_sm,
+                                          prec_sy,
+                                          m,
+                                          n,
+                                          x_stride,
+                                          xr_stride,
+                                          y_stride,
+                                          yr_stride,
+                                          pass,
+                                          ave_time,
+                                          0,
+                                          gb_per_sec);
+    }
+    return pass;
+}
+
+// Entry point: parse the command line, resolve the "auto" precisions and dispatch to the
+// run<> instantiation that matches the requested precisions and the save_mv flag.
+//
+// Exit codes
+//    0  run() succeeded
+//   -1  argument parsing failed
+//   -2  run() reported a failure
+//   -3  no instantiation exists for the requested precision combination
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sm == "auto")
+    {
+        prec_sm = "fp32";
+    }
+    if(prec_sy == "auto")
+    {
+        prec_sy = "fp32";
+    }
+    int save_mv = arg_parser.get_int("save_mv");
+
+    // no dynamic quant case
+    if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" && save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        // SaveMeanVar must mirror save_mv, exactly like the fp16 branches above
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+
+    // dynamic quant case, only in inference
+    else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+
+    // unsupported precision / save_mv combination
+    return -3;
+}
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
@@ -0,0 +1,70 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/layernorm2d.hpp"
+#include <string>
+
+// Primary template of the layernorm type bundle. It is only declared here; the
+// partial specializations below (one per supported input type) supply the members.
+//   InType               - input element type, selects the specialization
+//   OutType              - output element type
+//   SmoothScaleDataType_ - element type of the smooth-quant scale
+//   YScaleDataType_      - element type of the output scale
+template <typename InType,
+          typename OutType,
+          typename SmoothScaleDataType_,
+          typename YScaleDataType_>
+struct LayerNormTypeConfig;
+
+// LayerNormTypeConfig for fp16 (ck_tile::half_t) input.
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::half_t, OutType, SmoothScaleDataType_, YScaleDataType_>
+{
+    // Input element type; bias, gamma, beta, mean and inv-std all reuse it.
+    using XDataType      = ck_tile::half_t;
+    using XBiasDataType  = XDataType;
+    using GammaDataType  = XDataType;
+    using BetaDataType   = XDataType;
+    using MeanDataType   = XDataType;
+    using InvStdDataType = XDataType;
+
+    // Output and scale types are forwarded unchanged from the template arguments.
+    using YDataType           = OutType;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
+
+    // The compute type is float regardless of the input/output types.
+    using ComputeDataType = float;
+};
+
+// LayerNormTypeConfig for bf16 (ck_tile::bf16_t) input.
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, SmoothScaleDataType_, YScaleDataType_>
+{
+    // Input element type; bias, gamma, beta, mean and inv-std all reuse it.
+    using XDataType      = ck_tile::bf16_t;
+    using XBiasDataType  = XDataType;
+    using GammaDataType  = XDataType;
+    using BetaDataType   = XDataType;
+    using MeanDataType   = XDataType;
+    using InvStdDataType = XDataType;
+
+    // Output and scale types are forwarded unchanged from the template arguments.
+    using YDataType           = OutType;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
+
+    // The compute type is float regardless of the input/output types.
+    using ComputeDataType = float;
+};
+
+// Runtime (per-call) arguments of layernorm2d_fwd. Adds no members of its own:
+// everything is inherited from ck_tile::Layernorm2dFwdHostArgs.
+struct layernorm2d_fwd_args : ck_tile::Layernorm2dFwdHostArgs
+{
+};
+
+// This is the public API, will be generated by script
+// Selects which layernorm2d forward variant to run: precisions as strings plus
+// the feature switches below.
+struct layernorm2d_fwd_traits
+{
+    std::string prec_i; // input precision
+    std::string prec_o; // output precision
+
+    // fused_quant == 1: prec_sm and prec_sy must both be set to a proper precision string.
+    // fused_quant == 2: only prec_sy must be set to a proper precision string.
+    // In every other case the string may be arbitrary (the check is skipped).
+    std::string prec_sm; // x-scale, used for [1*N] input smooth quant
+    std::string prec_sy; // y-scale, used for [M*1] output for next layer
+
+    bool save_mean_var; // save mean/variance switch (NOTE(review): exact outputs not visible here)
+    int xbias;          // 0:no-bias, 1:add bias
+    int fused_add;      // 0:no-add, 1:pre-add-store, 2:pre-add
+    int fused_quant;    // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+};
+
+// Runs the layernorm2d forward variant selected by `traits` on the tensors
+// described by `args`, using the launch settings in `stream`.
+// NOTE(review): the float result is presumably the measured kernel time - confirm
+// against the generated implementation, which is not part of this header.
+float layernorm2d_fwd(layernorm2d_fwd_traits traits,
+                      layernorm2d_fwd_args args,
+                      const ck_tile::stream_config& stream);
--- a/example/ck_tile/02_layernorm2d/misc/dquant.png
+++ b/example/ck_tile/02_layernorm2d/misc/dquant.png
--- a/example/ck_tile/02_layernorm2d/misc/pnorm.png
+++ b/example/ck_tile/02_layernorm2d/misc/pnorm.png
--- a/example/ck_tile/02_layernorm2d/script/perf_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/perf_test.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_layernorm2d_fwd -type f | head -n 1)"
+
+$EXE -m=1 -n=1 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=80 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=bf16 -repeat=1000
+
+$EXE -m=700 -n=80 -e=1e-12 -v=1  -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=128 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=144 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=168 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=184 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=256 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=288 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=344 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=376 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=448 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=512 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=924 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1024 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1078 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=1996 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
+$EXE -m=700 -n=4080 -e=1e-12 -v=1 -prec_i=fp16 -repeat=1000
--- a/example/ck_tile/02_layernorm2d/script/smoke_test.sh
+++ b/example/ck_tile/02_layernorm2d/script/smoke_test.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_layernorm2d_fwd -type f | head -n 1)"
+
+for fquant in "" "-fquant=1 -prec_o=int8" "-fquant=1 -prec_o=fp8"; do
+for pr_i in "fp16" "bf16" ; do
+for fadd in "0" "1"; do
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=99  -n=13
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=17  -n=16
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=100
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=4   -n=128
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=80  -n=127
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=22  -n=255 -stride=256
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=599
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=19  -n=512
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=33  -n=313 -stride=1000
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=11  -n=510
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=171 -n=676 -stride=818
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=91  -n=636
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=12  -n=768 -stride=800
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=100 -n=766 -stride=812
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=31  -n=1024
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=64  -n=1000 -stride=1004
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=8   -n=1501
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=1826
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=5   -n=2040
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=7   -n=2734
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=3182
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=9   -n=4096
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=8192
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=9120
+$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=1   -n=10547
+#$EXE -prec_i=$pr_i -fadd=$fadd $fquant -m=3   -n=17134
+done
+done
+done
--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# CK Tile GEMM examples; only configured when GPU_TARGETS names a gfx90a, gfx94x or
+# gfx95x architecture.
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+  add_executable(tile_example_gemm_basic gemm_basic.cpp)
+  add_executable(tile_example_gemm_universal universal_gemm.cpp)
+  add_executable(tile_example_gemm_weight_preshuffle gemm_weight_preshuffle.cpp)
+  add_executable(tile_example_gemm_reduce gemm_splitk_two_stage_reduce.cpp)
+  add_executable(tile_example_gemm_splitk_two_stage gemm_splitk_two_stage.cpp)
+
+  # Options shared by every example except the weight-preshuffle one.
+  set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+  if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  endif()
+  # "SHELL:" keeps "-mllvm <arg>" together as one option group. As two bare list
+  # items, CMake's option de-duplication could drop this "-mllvm" when another
+  # "-mllvm ..." pair reaches the same target, leaving a dangling argument.
+  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS "SHELL:-mllvm -enable-noalias-to-md-conversion=0")
+
+  # Options used only by the weight-preshuffle example.
+  # NOTE(review): --save-temps writes compiler intermediates into the build tree and
+  # -Wno-gnu-line-marker looks like its companion; confirm both are still wanted.
+  set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS
+    -Wno-unused-local-typedef
+    -Wno-gnu-line-marker
+    --save-temps
+    "SHELL:-mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0"
+  )
+
+  foreach(gemm_example IN ITEMS
+      tile_example_gemm_basic
+      tile_example_gemm_universal
+      tile_example_gemm_reduce
+      tile_example_gemm_splitk_two_stage)
+    target_compile_options(${gemm_example} PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  endforeach()
+  target_compile_options(tile_example_gemm_weight_preshuffle
+    PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
+endif()
--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -0,0 +1,98 @@
+# GEMM with CK Tile
+
+This example demonstrates matrix multiplication (GEMM) using the CK Tile programming model, focusing on tile-based parallelism and modular kernel design.
+
+---
+
+## Algorithm and Math
+
+GEMM computes:
+$$
+C = A \times B
+$$
+where $A$ is $[M, K]$, $B$ is $[K, N]$ (stored column-major by default), and $C$ is $[M, N]$.
+
+- **BlockTile GEMM**: Each Block Tile computes a tile of $C$ by loading tiles of $A$ and $B$, performing blockwise matrix multiply-accumulation, and writing results back with the epilogue.
+
+---
+
+## Tile Programming Model
+
+- **Configuration**: Describes how the kernel is going to be initialized: Block Tile dimensions, Warp layout, Warp Tile dimensions, and other improvements.
+- **Block Tile**: Each block tile is allocated to a compute unit of the AMD GPU.
+- **Pipeline**: Modular design allows swapping different memory/computation pipelines (e.g., basic, memory-bound, compute).
+- **Block GEMM**: Block Level implementation on how to coordinate the warps iteration and memory layout in block tile.
+- **Warp GEMM**: Each Warp's GEMM Calculation
+- **Epilogue**: Transferring the Accumulated result from register to global memory.
+
+---
+
+## Features
+
+- **Flexible Layouts**: Supports row/column-major and custom strides for $A$, $B$, $C$.
+- **Split K**: Also split the Block Tile along the K dimension and add the partial results back together after the matrix multiply-accumulation. This gives higher performance when M and N are small and K is large.
+- **Preshuffled GEMM**: For inference tasks, pre-shuffle the B (weight) matrix into the warp layout and bypass shared memory during the GEMM calculation. Best-performing solution for GEMM.
+- **Precision**: Supports fp16, bf16, fp8, bf8, int4 (for B Matrix).
+- **Validation**: CPU/GPU validation and error tolerance options.
+
+---
+
+## Build & Run
+
+```bash
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+../script/cmake-ck-dev.sh  ../ <arch>
+# The basic pipeline method on the gemm calculation
+make tile_example_gemm_basic -j`nproc`
+# The memory bound pipeline on the gemm calculation
+make tile_example_gemm_universal -j`nproc`
+# The weight preshuffle pipeline on the gemm calculation
+make tile_example_gemm_weight_preshuffle -j`nproc`
+```
+This will produce the executables `build/bin/tile_example_gemm_basic`, `build/bin/tile_example_gemm_universal` and `build/bin/tile_example_gemm_weight_preshuffle`.
+
+## example
+```
+args:
+          -m    m dimension (default:1024)
+          -n    n dimension (default:2048)
+          -k    k dimension (default:64)
+   -a_layout    Tensor A data layout (default: R)
+   -b_layout    Tensor B data layout (default: C)
+   -c_layout    Tensor C data layout (default: R)
+   -stride_a    Tensor A stride (default:0)
+   -stride_b    Tensor B stride (default:0)
+   -stride_c    Tensor C stride (default:0)
+          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
+       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+     -warmup    number of iterations before benchmark the kernel (default:50)
+     -repeat    number of iterations to benchmark the kernel (default:100)
+      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
+          -split_k    splitK value (default:1)
+       -init    0:random, 1:linear, 2:constant(1) (default:0)
+ -persistent    0:non-persistent, 1:persistent (default:0)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:gemm.json)
+```
+
+
+## Source Structure
+
+- **Executables**: `gemm_basic.cpp`, `universal_gemm.cpp` (different kinds of GEMM implementation)
+- **Utils**: `gemm_utils.hpp` (helper functions)
+- **Build**: `CMakeLists.txt`, `run_gemm_example.inc`
+- **Scripts**: `script/` (build and run helpers)
+
+---
+
+## Related CK Tile Examples
+
+- [01_fmha](../01_fmha/README.md): Fused multi-head attention (FMHA)
+- [18_flatmm](../18_flatmm/README.md): Preshuffled GEMM alternative solution
+- [16_batched_gemm](../16_batched_gemm/README.md): Batched GEMM with tiles
+
+For distribution, see `include/ck_tile/tile_program/tile_distribution/`.
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -0,0 +1,107 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+#include "run_gemm_example_common.hpp"
+#include "gemm_basic_invoker.hpp"
+#include "ck_tile/core/utility/gemm_validation.hpp"
+
+// Driver of the basic GEMM example.
+// Reads precision, layouts, problem sizes and strides from the parsed command line,
+// validates the strides, then dispatches on "-prec" to the matching
+// run_gemm_example_prec_type instantiation and returns its result.
+// Throws std::runtime_error for a precision string that is not handled below.
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    using GemmConfig = GemmConfigBase;
+    using Invoker    = BasicInvoker;
+
+    std::string prec     = arg_parser.get_str("prec");
+    std::string layout_a = arg_parser.get_str("a_layout");
+    std::string layout_b = arg_parser.get_str("b_layout");
+    std::string layout_c = arg_parser.get_str("c_layout");
+
+    std::tuple<ck_tile::index_t, ck_tile::index_t, ck_tile::index_t> mnk =
+        parse_gemm_size(arg_parser);
+
+    int m = std::get<0>(mnk);
+    int n = std::get<1>(mnk);
+    int k = std::get<2>(mnk);
+
+    int lda = arg_parser.get_int("stride_a");
+    int ldb = arg_parser.get_int("stride_b");
+    int ldc = arg_parser.get_int("stride_c");
+
+    ck_tile::validate_gemm_stride(layout_a, layout_b, layout_c, m, n, k, lda, ldb, ldc);
+
+    // Every branch below either returns or throws, so no else-chaining is needed.
+    if(prec == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig, Invoker, ck_tile::half_t>(
+            layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig, Invoker, ck_tile::bf16_t>(
+            layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "bf8")
+    {
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "i8")
+    {
+        return run_gemm_example_prec_type<GemmConfig,
+                                          Invoker,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          int32_t>(layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "pk_int4_t")
+    {
+        // TODO: Add support for bhalf_t ADataType
+        // Packed int4 is only dispatched when the config selects the COMPUTE_V3 pipeline.
+        if constexpr(GemmConfig::Pipeline == ck_tile::GemmPipeline::COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type<GemmConfig,
+                                              Invoker,
+                                              ck_tile::half_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(layout_a, layout_b, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported data type for this operation !!!");
+        }
+    }
+
+    throw std::runtime_error("Unsupported data type for this operation !!!");
+}
+
+// Program entry: parse the command line, run the example, and map the outcome to an
+// exit status (-1: argument parsing failed, EXIT_FAILURE: std::runtime_error caught,
+// otherwise the logical negation of run_gemm_example's result).
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+    if(!arg_parser.parse(argc, argv))
+    {
+        return -1;
+    }
+
+    try
+    {
+        const int outcome = run_gemm_example(arg_parser);
+        // A non-zero outcome becomes exit status 0, a zero outcome becomes 1.
+        return !outcome;
+    }
+    catch(const std::runtime_error& err)
+    {
+        std::cerr << "Runtime error: " << err.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/gemm_basic_invoker.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic_invoker.hpp
@@ -0,0 +1,156 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include "gemm_utils.hpp"
+
+// Invoker for the basic GEMM example: builds a ck_tile::GemmKernel from a fixed tile
+// configuration and launches it through ck_tile::launch_kernel_time_mask.
+struct BasicInvoker
+{
+    // Builds and launches the kernel for the given host arguments and stream config.
+    //
+    // Template parameters DsDataType, DsLayout and CDEElementWise are accepted but
+    // not used in this body: the epilogue below is instantiated with empty
+    // ck_tile::tuple<> for the D tensors and with element_wise::PassThrough.
+    // Persistent is not honoured either; a warning is printed when it is true.
+    //
+    // Returns the float produced by ck_tile::launch_kernel_time_mask
+    // (NOTE(review): presumably the measured kernel time - confirm).
+    // Throws std::runtime_error when Kernel::IsSupportedArgument rejects the arguments.
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename CLayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+    {
+        if constexpr(Persistent)
+        {
+            std::cout << "WARNING: Ignoring persistent kernel option for basic gemm." << std::endl;
+        }
+
+        // This part comes from the Codegen
+        // Block tile extents are fixed here; they are not read from GemmConfig.
+        constexpr ck_tile::index_t M_Tile = 256;
+        constexpr ck_tile::index_t N_Tile = 256;
+        constexpr ck_tile::index_t K_Tile = 64;
+
+        // Warp layout and warp tile extents depend on whether the build uses WMMA.
+#if CK_TILE_USE_WMMA
+        constexpr ck_tile::index_t M_Warp = 4;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 16;
+        constexpr ck_tile::index_t N_Warp_Tile = 16;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+#else
+        constexpr ck_tile::index_t M_Warp = 2;
+        constexpr ck_tile::index_t N_Warp = 2;
+        constexpr ck_tile::index_t K_Warp = 1;
+
+        constexpr ck_tile::index_t M_Warp_Tile = 32;
+        constexpr ck_tile::index_t N_Warp_Tile = 32;
+        constexpr ck_tile::index_t K_Warp_Tile = 16;
+#endif
+
+        using CodegenGemmShape =
+            ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
+                                   ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
+                                   ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
+
+        using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenGemmShape>;
+
+        // Padding switches come from GemmConfig; layouts come from the template arguments.
+        using CodegenGemmTraits = ck_tile::TileGemmTraits<GemmConfig::kPadM,
+                                                          GemmConfig::kPadN,
+                                                          GemmConfig::kPadK,
+                                                          ALayout,
+                                                          BLayout,
+                                                          CLayout>;
+
+        using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
+                                                                    BDataType,
+                                                                    AccDataType,
+                                                                    CodegenGemmShape,
+                                                                    CodegenGemmTraits>;
+
+        using CodegenGemmPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             ck_tile::tuple<>,
+                                             AccDataType,
+                                             CDataType,
+                                             ck_tile::tuple<>,
+                                             CLayout,
+                                             ck_tile::element_wise::PassThrough,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             M_Warp,
+                                             N_Warp,
+                                             M_Warp_Tile,
+                                             N_Warp_Tile,
+                                             K_Warp_Tile,
+                                             CodegenPipelineProblem::TransposeC>>;
+
+        // ToDo: Will add the codegen part to test different pipeline policies in GEMM.
+        // Now we only use the BlockGemmASmemBSmemCRegV1DefaultPolicy.
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, CodegenGemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+
+        const dim3 grids  = Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << CodegenGemmShape::GetName() << '\n'
+                      << "problem: " << CodegenPipelineProblem::GetName() << '\n'
+                      << "pipeline: " << CodegenGemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        // Declare rotating_mem_ptr here so it stays in scope until it is needed
+        std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+        // Callback handed to launch_kernel_time_mask together with the kernel.
+        std::function<void()> preprocess;
+
+        // With split-K (k_batch > 1) the output buffer is zeroed asynchronously on the
+        // launch stream. The hipError_t is only passed to hipGetErrorString and the
+        // resulting string is discarded, so a failing memset is not reported.
+        // NOTE(review): the byte count assumes a densely packed M x N output; confirm
+        // this is still right when the output uses a padded stride.
+        auto clear_gemm_output = [&]() {
+            if(args.k_batch > 1)
+                hipGetErrorString(hipMemsetAsync(
+                    args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+        };
+
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            // These host tensors are only used to obtain the A / B buffer sizes in bytes.
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            rotating_mem_ptr = std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem_ptr->Print();
+
+            // Flush the instruction cache, advance the rotating buffers, then clear the output.
+            preprocess = [&]() {
+                ck_tile::flush_icache();
+                rotating_mem_ptr->Next();
+                clear_gemm_output();
+            };
+        }
+        else
+        {
+            // No cache flushing requested: only the split-K output clear remains.
+            preprocess = clear_gemm_output;
+        }
+
+        return ck_tile::launch_kernel_time_mask(
+            s,
+            preprocess,
+            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    }
+};
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage.cpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+#include "run_gemm_example_common.hpp"
+#include "gemm_splitk_two_stage_invoker.hpp"
+
+// Driver of the two-stage split-K GEMM example.
+// GemmConfig is a two-parameter template; it is instantiated here with the selected
+// precision type and float. Dispatches on "-prec" and returns the result of
+// run_gemm_example_prec_type; throws std::runtime_error for any precision other
+// than fp16 / bf16.
+template <template <typename PreType, typename WorkspaceType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    using Invoker = SplitKTwoStageInvoker;
+
+    std::string prec     = arg_parser.get_str("prec");
+    std::string layout_a = arg_parser.get_str("a_layout");
+    std::string layout_b = arg_parser.get_str("b_layout");
+
+    // Each supported precision returns directly; anything else falls through to the throw.
+    if(prec == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, float>,
+                                          Invoker,
+                                          ck_tile::half_t>(layout_a, layout_b, arg_parser);
+    }
+
+    if(prec == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, float>,
+                                          Invoker,
+                                          ck_tile::bf16_t>(layout_a, layout_b, arg_parser);
+    }
+
+    throw std::runtime_error("Unsupported data type for this operation !!!");
+}
+
+// Program entry: parse the command line, run the example with the WMMA or non-WMMA
+// configuration chosen at compile time, and map the outcome to an exit status
+// (-1: argument parsing failed, EXIT_FAILURE: std::runtime_error caught, otherwise
+// the logical negation of run_gemm_example's result).
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+    if(!arg_parser.parse(argc, argv))
+    {
+        return -1;
+    }
+
+    try
+    {
+#if CK_TILE_USE_WMMA
+        const int outcome = run_gemm_example<GemmConfigTwoStage_Wmma>(arg_parser);
+#else
+        const int outcome = run_gemm_example<GemmConfigTwoStage>(arg_parser);
+#endif
+        // A non-zero outcome becomes exit status 0, a zero outcome becomes 1.
+        return !outcome;
+    }
+    catch(const std::runtime_error& err)
+    {
+        std::cerr << "Runtime error: " << err.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_invoker.hpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_invoker.hpp
@@ -0,0 +1,215 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include "gemm_utils.hpp"
+#include "ck_tile/ops/elementwise.hpp"
+
+// GEMM configuration for the two-stage split-K example: everything from
+// GemmConfigComputeV3<PrecType_> plus the element type used for the workspace.
+template <typename PrecType_, typename WorkspaceType_>
+struct GemmConfigTwoStage : GemmConfigComputeV3<PrecType_>
+{
+    // WorkspaceType_ passed through ck_tile::remove_cvref_t.
+    using WorkspaceType = ck_tile::remove_cvref_t<WorkspaceType_>;
+};
+
+// WMMA flavour of the two-stage split-K configuration: everything from
+// GemmConfigComputeV3_WMMA<PrecType_> plus the element type used for the workspace.
+template <typename PrecType_, typename WorkspaceType_>
+struct GemmConfigTwoStage_Wmma : GemmConfigComputeV3_WMMA<PrecType_>
+{
+    // WorkspaceType_ passed through ck_tile::remove_cvref_t.
+    using WorkspaceType = ck_tile::remove_cvref_t<WorkspaceType_>;
+};
+
+// Two-stage split-K GEMM invoker.
+//
+// gemm() runs the GEMM kernel with its output redirected to a temporary M x N
+// workspace whose element type is GemmConfig::WorkspaceType, followed by an
+// element-wise kernel (configured with UnaryConvert) that reads the workspace and
+// writes to the caller's output buffer as CDataType.
+struct SplitKTwoStageInvoker
+{
+    // Builds both kernels and hands them to ck_tile::launch_kernel_time_mask, whose
+    // result is returned.
+    //
+    // args : host-side GEMM arguments; args.c_ptr is the destination of the second kernel.
+    // s    : stream configuration (stream id, log level, cache flushing, rotating count).
+    //
+    // Throws std::runtime_error when either kernel reports unsupported arguments.
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+    {
+        using WorkspaceType = ck_tile::remove_cvref_t<typename GemmConfig::WorkspaceType>;
+
+        // ---------------- Stage 1: GEMM kernel types ----------------
+        using BlockTileDims =
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>;
+        using BlockWarpDims =
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>;
+        using WarpTileDims = ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>;
+        using TileShape = ck_tile::TileGemmShape<BlockTileDims,
+                                                 BlockWarpDims,
+                                                 WarpTileDims,
+                                                 GemmConfig::PermuteA,
+                                                 GemmConfig::PermuteB>;
+
+        using Partitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<TileShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using UniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+
+        using PipelineProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                      BDataType,
+                                                                      AccDataType,
+                                                                      TileShape,
+                                                                      UniversalTraits,
+                                                                      GemmConfig::Scheduler>;
+
+        using PipelineImpl = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<PipelineProblem>;
+
+        // The epilogue emits WorkspaceType (not CDataType): stage 1 targets the workspace.
+        using Epilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             WorkspaceType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             Partitioner::MPerBlock,
+                                             Partitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             PipelineProblem::TransposeC,
+                                             GemmConfig::NumWaveGroups>>;
+
+        using Stage1Kernel = ck_tile::GemmKernel<Partitioner, PipelineImpl, Epilogue>;
+
+        // ---------------- Stage 1: arguments ----------------
+        // Copy the caller's arguments, remember the real output pointer and point the
+        // copy at a freshly allocated M x N workspace instead.
+        const auto workspace_bytes = args.M * args.N * sizeof(WorkspaceType);
+        ck_tile::DeviceMem workspace_buf(workspace_bytes);
+        ck_tile::GemmHostArgs ws_args(args);
+        auto c_ptr      = ws_args.c_ptr;
+        ws_args.c_ptr   = workspace_buf.GetDeviceBuffer();
+        auto gemm_kargs = Stage1Kernel::MakeKernelArgs(ws_args);
+
+        const dim3 grids  = Persistent ? Stage1Kernel::MaxOccupancyGridSize(s)
+                                       : Stage1Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Stage1Kernel::BlockSize();
+
+        if(!Stage1Kernel::IsSupportedArgument(gemm_kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        // ---------------- Stage 2: element-wise kernel types ----------------
+        using ConvertOp         = ck_tile::element_wise::UnaryConvert;
+        using ConvertBlockTile  = ck_tile::sequence<2048>;
+        using ConvertBlockWarps = ck_tile::sequence<8>;
+        using ConvertWarpTile   = ck_tile::sequence<64>;
+
+        using ConvertShape = ck_tile::
+            ElementWiseShape<ConvertBlockWarps, ConvertBlockTile, ConvertWarpTile, WorkspaceType>;
+        using ConvertProblem = ck_tile::ElementWisePipelineProblem<WorkspaceType,
+                                                                   WorkspaceType,
+                                                                   CDataType,
+                                                                   ConvertShape,
+                                                                   ConvertOp>;
+        using Stage2Kernel =
+            ck_tile::ElementWiseKernel<ConvertProblem, ck_tile::ElementWiseDefaultPolicy>;
+
+        // ---------------- Stage 2: launch geometry ----------------
+        const ck_tile::index_t total_elements = args.M * args.N;
+
+        const ck_tile::index_t kBlockSize              = Stage2Kernel::BlockSize();
+        constexpr ck_tile::index_t kConvertBlocksPerCu = 1;
+
+        // One block per ConvertBlockTile-sized slice of elements, rounded up.
+        constexpr ck_tile::index_t elements_per_block =
+            ConvertBlockTile::at(ck_tile::number<0>{});
+        ck_tile::index_t kGridSize = (total_elements + elements_per_block - 1) / elements_per_block;
+
+        auto input_tensors = ck_tile::make_tuple(static_cast<WorkspaceType*>(ws_args.c_ptr));
+        auto input_size    = ck_tile::make_tuple(args.M, args.N);
+
+        if(!Stage2Kernel::IsSupportedArgument(input_size))
+        {
+            throw std::runtime_error(
+                "Wrong! Elementwise arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Stage1Kernel::GetName() << '\n'
+                      << "shape: " << TileShape::GetName() << '\n'
+                      << "problem: " << PipelineProblem::GetName() << '\n'
+                      << "pipeline: " << PipelineImpl::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        // ---------------- Preprocess callback ----------------
+        // Zeroes the workspace when k_batch > 1; the memset status is not inspected.
+        auto clear_gemm_output = [&]() {
+            if(args.k_batch > 1)
+                hipGetErrorString(
+                    hipMemsetAsync(ws_args.c_ptr, 0, workspace_bytes, s.stream_id_));
+        };
+
+        // Declared at function scope so the wrapper outlives the preprocess lambda
+        // that references it.
+        std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+
+        // Default: only clear the workspace. Replaced below when cache flushing is on.
+        std::function<void()> preprocess = clear_gemm_output;
+
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            // Host tensors are created only to query the byte size of A and B.
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            rotating_mem_ptr = std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                gemm_kargs.as_ptr[0],
+                gemm_kargs.bs_ptr[0],
+                s.rotating_count_,
+                size_a_buffer,
+                size_b_buffer);
+            rotating_mem_ptr->Print();
+
+            preprocess = [&]() {
+                ck_tile::flush_icache();
+                rotating_mem_ptr->Next();
+                clear_gemm_output();
+            };
+        }
+
+        // ---------------- Launch both kernels ----------------
+        return ck_tile::launch_kernel_time_mask(
+            s,
+            preprocess,
+            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(
+                Stage1Kernel{}, grids, blocks, 0, gemm_kargs),
+            ck_tile::make_kernel<kConvertBlocksPerCu>(
+                Stage2Kernel{},
+                kGridSize,
+                kBlockSize,
+                0,
+                input_size,
+                ck_tile::make_tuple(args.N, 1), // Input Stride
+                ck_tile::make_tuple(args.N, 1), // Output Stride
+                input_tensors,
+                static_cast<CDataType*>(c_ptr)));
+    }
+};
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
@@ -0,0 +1,963 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/common/utils.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+
+/**
+ * @brief Spatially local tile partitioner that also reports a per-split output offset.
+ *
+ * Derives from ck_tile::GemmSpatiallyLocalTilePartitioner and adds GetOutputOffset(),
+ * which places the output of K-split @c k_id at element offset k_id * M * N, i.e. one
+ * M*N workspace slice per split.
+ */
+template <typename BlockGemmShapeType, ck_tile::index_t GroupNum, ck_tile::index_t M01>
+struct GemmSplitKTilePartitioner
+    : ck_tile::GemmSpatiallyLocalTilePartitioner<BlockGemmShapeType, GroupNum, M01>
+{
+    using Base = ck_tile::GemmSpatiallyLocalTilePartitioner<BlockGemmShapeType, GroupNum, M01>;
+
+    // Re-export the base constructors and GetLoopNum.
+    using Base::Base;
+    using Base::GetLoopNum;
+
+    /**
+     * @brief Element offset of the output slice belonging to one K-split.
+     *
+     * @param kargs  Kernel arguments; only k_batch, M and N are read.
+     * @param k_id   Index of the K-split.
+     * @return k_id * M * N when k_batch > 1, otherwise 0.
+     */
+    template <typename KernelArgs>
+    CK_TILE_HOST_DEVICE static ck_tile::index_t GetOutputOffset(const KernelArgs& kargs,
+                                                                ck_tile::index_t k_id) noexcept
+    {
+        // Without split-K there is a single slice, so no offset is applied.
+        if(!(kargs.k_batch > 1))
+        {
+            return 0;
+        }
+
+        return k_id * kargs.M * kargs.N;
+    }
+};
+
+/**
+ * @brief Extended GEMM host arguments for the two-stage split-K implementation.
+ *
+ * The two stages are:
+ * 1. Stage 1: the GEMM writes partial results to workspace memory.
+ * 2. Stage 2: a reduction kernel sums the workspace into the final output.
+ *
+ * The inherited output pointer/stride (e_ptr / stride_E) describe the workspace, while
+ * final_output_ptr / final_stride_E describe the tensor the caller actually wants.
+ */
+struct GemmSplitKHostArgs : public ck_tile::GemmHostArgs
+{
+    using BaseArgs = ck_tile::GemmHostArgs;
+
+    /// Default construction; the final-output fields start as nullptr / 0 through
+    /// their default member initializers instead of being left indeterminate.
+    CK_TILE_HOST GemmSplitKHostArgs() = default;
+
+    /**
+     * @param a_ptr_            Pointer forwarded to the base as the A operand.
+     * @param b_ptr_            Pointer forwarded to the base as the B operand.
+     * @param workspace_ptr_    Workspace for partial results; becomes the base output pointer.
+     * @param e_ptr_            Final output destination, stored in final_output_ptr.
+     * @param k_batch_          Number of K-splits.
+     * @param M_, N_, K_        Problem dimensions.
+     * @param stride_A_         Stride of A.
+     * @param stride_B_         Stride of B.
+     * @param workspace_stride_ Stride forwarded to the base as the output stride.
+     * @param stride_E_         Stride of the final output, stored in final_stride_E.
+     */
+    CK_TILE_HOST GemmSplitKHostArgs(const void* a_ptr_,
+                                    const void* b_ptr_,
+                                    void* workspace_ptr_, // Workspace for partial results
+                                    void* e_ptr_,         // Final output destination
+                                    ck_tile::index_t k_batch_,
+                                    ck_tile::index_t M_,
+                                    ck_tile::index_t N_,
+                                    ck_tile::index_t K_,
+                                    ck_tile::index_t stride_A_,
+                                    ck_tile::index_t stride_B_,
+                                    ck_tile::index_t workspace_stride_,
+                                    ck_tile::index_t stride_E_)
+        : BaseArgs(a_ptr_,
+                   b_ptr_,
+                   workspace_ptr_, // Base e_ptr = workspace_ptr
+                   k_batch_,
+                   M_,
+                   N_,
+                   K_,
+                   stride_A_,
+                   stride_B_,
+                   workspace_stride_),
+          final_output_ptr(e_ptr_),
+          final_stride_E(stride_E_)
+    {
+    }
+
+    void* final_output_ptr          = nullptr; // Pointer to final output tensor
+    ck_tile::index_t final_stride_E = 0;       // Stride for final output tensor
+};
+
+/**
+ * @brief Stage 1: GEMM kernel that writes partial split-K results to workspace
+ *
+ * Runs the matrix multiplication with split-K, with the kernel output pointed at
+ * the workspace (the inherited e_ptr of @p args). The tile partitioner places each
+ * K-split at element offset k_id * M * N.
+ *
+ * Workspace layout: [k_batch, M, N] where each [M, N] slice contains
+ * partial results for one K-split.
+ *
+ * When s.flush_cache_ is set, a preprocess step runs before each timed launch: it
+ * flushes the instruction cache, rotates the A/B buffers and, for k_batch > 1,
+ * zeroes the complete [k_batch, M, N] workspace.
+ *
+ * @param args Extended arguments containing workspace and final output pointers
+ * @param s Stream configuration for kernel execution
+ * @return Execution time in milliseconds
+ * @throws std::runtime_error if the kernel rejects the arguments or the workspace
+ *         clear cannot be enqueued
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm_stage1(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+
+    // Partitioner variant that adds the per-split workspace offset.
+    using TilePartitioner = GemmSplitKTilePartitioner<GemmShape,
+                                                      GemmConfig::TileParitionerGroupNum,
+                                                      GemmConfig::TileParitionerM01>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 ELayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity,
+                                                                 Persistent,
+                                                                 GemmConfig::NumWaveGroups,
+                                                                 GemmConfig::Preshuffle>;
+
+    // Create base GEMM arguments pointing to workspace instead of final output
+    // The workspace will store partial results from each K-split
+    ck_tile::GemmHostArgs base_args(args.a_ptr,
+                                    args.b_ptr,
+                                    args.e_ptr,
+                                    args.k_batch,
+                                    args.M,
+                                    args.N,
+                                    args.K,
+                                    args.stride_A,
+                                    args.stride_B,
+                                    args.stride_E);
+    constexpr auto scheduler = GemmConfig::Scheduler;
+
+    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                       BDataType,
+                                                                       AccDataType,
+                                                                       GemmShape,
+                                                                       GemmUniversalTraits,
+                                                                       scheduler>;
+
+    using GemmPipeline = typename PipelineTypeTraits<GemmConfig::Pipeline>::template GemmPipeline<
+        UniversalGemmProblem>;
+
+    using GemmEpilogue =
+        ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<ADataType,
+                                                                   BDataType,
+                                                                   DsDataType,
+                                                                   AccDataType,
+                                                                   CDataType,
+                                                                   DsLayout,
+                                                                   ELayout,
+                                                                   CDEElementWise,
+                                                                   TilePartitioner::MPerBlock,
+                                                                   TilePartitioner::NPerBlock,
+                                                                   GemmConfig::M_Warp,
+                                                                   GemmConfig::N_Warp,
+                                                                   GemmConfig::M_Warp_Tile,
+                                                                   GemmConfig::N_Warp_Tile,
+                                                                   GemmConfig::K_Warp_Tile,
+                                                                   UniversalGemmProblem::TransposeC,
+                                                                   GemmConfig::NumWaveGroups>>;
+
+    using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+    auto kargs   = Kernel::MakeKernelArgs(base_args);
+
+    dim3 grids;
+    if constexpr(Persistent)
+    {
+        grids = Kernel::MaxOccupancyGridSize(s);
+    }
+    else
+    {
+        grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+    }
+    const dim3 blocks = Kernel::BlockSize();
+
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+    }
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Stage 1 - Launching GEMM kernel: " << Kernel::GetName() << '\n'
+                  << "shape: " << GemmShape::GetName() << '\n'
+                  << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                  << "pipeline: " << GemmPipeline::GetName() << '\n'
+                  << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+    }
+
+    if(s.flush_cache_)
+    {
+        std::cout << "Flushing cache..." << std::endl;
+
+        // Host tensors are created only to query the byte size of A and B.
+        ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+            args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+        ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+            args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+        auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+        auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+        ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+            kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+        rotating_mem.Print();
+
+        // The workspace holds k_batch slices of M*N elements, so all of them must be
+        // cleared. Widen to std::size_t before multiplying so that the byte count does
+        // not overflow the 32-bit index type.
+        const std::size_t workspace_bytes = static_cast<std::size_t>(args.k_batch) *
+                                            static_cast<std::size_t>(args.M) *
+                                            static_cast<std::size_t>(args.N) * sizeof(CDataType);
+
+        auto run_flush_cache = [&]() {
+            // flush icache
+            ck_tile::flush_icache();
+            // rotating mem
+            rotating_mem.Next();
+            // clear every K-split slice of the workspace
+            if(args.k_batch > 1)
+            {
+                const hipError_t clear_status =
+                    hipMemsetAsync(args.e_ptr, 0, workspace_bytes, s.stream_id_);
+                if(clear_status != hipSuccess)
+                {
+                    throw std::runtime_error("Failed to clear split-K workspace");
+                }
+            }
+        };
+        return ck_tile::launch_kernel_time_mask(
+            s,
+            run_flush_cache,
+            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    }
+    else
+    {
+        return ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    }
+}
+
+/**
+ * @brief Stage 2: reduction kernel that sums the partial split-K results.
+ *
+ * Launches ck_tile::ReduceKernel with ReduceOp::Add over the workspace written by
+ * stage 1, reducing dimension 0 (k_batch) and keeping dimensions 1 and 2 (M, N).
+ *
+ * Workspace layout: [k_batch, M, N] -> Final output: [M, N]
+ *
+ * @tparam CDataType Element type of both the workspace and the output
+ * @tparam ComputeDataType Type handed to the reduction problem as compute precision
+ * @tparam ELayout Layout tag of the output tensor (not referenced in the body)
+ * @param args Extended arguments; e_ptr is read as the workspace and
+ *             final_output_ptr is written as the result
+ * @param s Stream configuration for kernel execution
+ * @return The value returned by ck_tile::launch_kernel
+ */
+template <typename CDataType,
+          typename ComputeDataType = float,
+          typename ELayout         = ck_tile::tensor_layout::gemm::RowMajor>
+float reduce_stage2(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    // ---- Kernel configuration ----
+    using ReduceOp   = ck_tile::ReduceOp::Add;
+    using BlockWarps = ck_tile::sequence<1, 1>;
+    using BlockTile  = ck_tile::sequence<256, 1>;
+    using WarpTile   = ck_tile::sequence<256, 1>;
+    using ThreadTile = ck_tile::sequence<1, 1>;
+
+    constexpr auto kept_dims    = ck_tile::sequence<1, 2>{}; // M and N survive
+    constexpr auto reduced_dims = ck_tile::sequence<0>{};    // k_batch is summed away
+
+    using Shape   = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+    using Problem = ck_tile::Reduce2dProblem<CDataType,
+                                             ComputeDataType,
+                                             CDataType,
+                                             Shape,
+                                             ReduceOp,
+                                             decltype(kept_dims),
+                                             decltype(reduced_dims),
+                                             3>;
+    using Kernel  = ck_tile::ReduceKernel<Problem>;
+
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    // ---- Launch geometry: one block per BlockTile[0] output elements, rounded up ----
+    const ck_tile::index_t output_size          = args.M * args.N;
+    constexpr ck_tile::index_t kOutputsPerBlock = BlockTile::at(ck_tile::number<0>{});
+    ck_tile::index_t kGridSize = (output_size + kOutputsPerBlock - 1) / kOutputsPerBlock;
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
+
+    // ---- Tensor description of the workspace ----
+    auto workspace_shape = ck_tile::make_tuple(args.k_batch, args.M, args.N);
+    // NOTE(review): the row stride is taken from final_stride_E, whereas stage 1 writes
+    // the workspace using args.stride_E; the two only agree when the caller passes the
+    // same value for both - confirm this is intended.
+    auto workspace_strides =
+        ck_tile::make_tuple(args.M * args.N,     // distance between consecutive K-splits
+                            args.final_stride_E, // distance between consecutive rows
+                            1);                  // elements within a row are adjacent
+
+    if(s.log_level_ > 0)
+    {
+        std::cout << "Stage 2 - Launching Reduction kernel" << '\n'
+                  << "workspace shape: [" << args.k_batch << ", " << args.M << ", " << args.N << "]"
+                  << '\n'
+                  << "output shape: [" << args.M << ", " << args.N << "]" << '\n'
+                  << "grid size: " << kGridSize << std::endl;
+    }
+
+    const auto* workspace_in = static_cast<const CDataType*>(args.e_ptr);
+    auto* final_out          = static_cast<CDataType*>(args.final_output_ptr);
+
+    return ck_tile::launch_kernel(s,
+                                  ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                                                    kGridSize,
+                                                                    kBlockSize,
+                                                                    0, // LDS size
+                                                                    workspace_in,
+                                                                    final_out,
+                                                                    workspace_shape,
+                                                                    workspace_strides));
+}
+
+/**
+ * @brief Runs both stages of the two-stage split-K GEMM.
+ *
+ * Sequence:
+ * 1. gemm_stage1 computes the GEMM into the workspace.
+ * 2. The stream is synchronized.
+ * 3. For k_batch > 1, reduce_stage2 sums the workspace into the final output;
+ *    otherwise the single M*N slice is copied device-to-device to the final output.
+ *
+ * @param args Extended arguments for two-stage execution
+ * @param s Stream configuration
+ * @return Sum of the stage-1 time and the reduction time (0 when no reduction ran)
+ * @throws std::runtime_error if stream synchronization or the copy fails
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          bool Persistent,
+          typename CDEElementWise>
+float gemm_splitk_two_stage(const GemmSplitKHostArgs& args, const ck_tile::stream_config& s)
+{
+    const bool verbose         = s.log_level_ > 0;
+    const bool needs_reduction = args.k_batch > 1;
+
+    if(verbose)
+    {
+        std::cout << "Starting Two-Stage GEMM+SplitK with k_batch=" << args.k_batch << std::endl;
+        std::cout << "Workspace size: " << args.k_batch << " x " << args.M << " x " << args.N
+                  << " = " << args.k_batch * args.M * args.N * sizeof(CDataType) << " bytes"
+                  << std::endl;
+    }
+
+    // Stage 1: GEMM into the workspace.
+    const float gemm_time = gemm_stage1<GemmConfig,
+                                        ADataType,
+                                        BDataType,
+                                        DsDataType,
+                                        AccDataType,
+                                        CDataType,
+                                        ALayout,
+                                        BLayout,
+                                        DsLayout,
+                                        ELayout,
+                                        Persistent,
+                                        CDEElementWise>(args, s);
+
+    // Stage 1 must have finished before the workspace is consumed.
+    if(hipStreamSynchronize(s.stream_id_) != hipSuccess)
+    {
+        throw std::runtime_error("Stream synchronization failed");
+    }
+
+    // Stage 2: bring the workspace contents into the final output.
+    float reduce_time = 0.0f;
+    if(needs_reduction)
+    {
+        // half and bf16 outputs are accumulated in float; other types reduce as-is.
+        constexpr bool is_16bit_float = std::is_same_v<CDataType, ck_tile::half_t> ||
+                                        std::is_same_v<CDataType, ck_tile::bf16_t>;
+        using ComputeDataType = std::conditional_t<is_16bit_float, float, CDataType>;
+
+        reduce_time = reduce_stage2<CDataType, ComputeDataType, ELayout>(args, s);
+    }
+    else
+    {
+        // Only one K-split: the workspace already holds the result, so copy it over.
+        const auto copy_status = hipMemcpyAsync(args.final_output_ptr,
+                                                args.e_ptr,
+                                                args.M * args.N * sizeof(CDataType),
+                                                hipMemcpyDeviceToDevice,
+                                                s.stream_id_);
+        if(copy_status != hipSuccess)
+        {
+            throw std::runtime_error("Memory copy failed");
+        }
+    }
+
+    const float total_time = gemm_time + reduce_time;
+
+    if(verbose)
+    {
+        std::cout << "GEMM stage time: " << gemm_time << " ms" << std::endl;
+        if(needs_reduction)
+        {
+            std::cout << "Reduction stage time: " << reduce_time << " ms" << std::endl;
+        }
+        std::cout << "Total time: " << total_time << " ms" << std::endl;
+    }
+
+    return total_time;
+}
+
+/**
+ * @brief High-level interface for two-stage split-K GEMM execution
+ *
+ * @param a_m_k_dev_buf Input matrix A device buffer
+ * @param b_k_n_dev_buf Input matrix B device buffer
+ * @param c_m_n_dev_buf Output matrix C device buffer
+ * @param M Matrix M dimension
+ * @param N Matrix N dimension
+ * @param K Matrix K dimension
+ * @param stride_A Memory stride for matrix A
+ * @param stride_B Memory stride for matrix B
+ * @param stride_C Memory stride for matrix C
+ * @param kbatch Number of K-splits for split-K execution
+ * @param n_warmup Number of warmup iterations
+ * @param n_repeat Number of repeat iterations for benchmarking
+ * @param persistent Whether to use persistent kernel execution
+ * @return Average execution time in milliseconds
+ */
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm_splitk_two_stage(ck_tile::DeviceMem& a_m_k_dev_buf,
+                                   ck_tile::DeviceMem& b_k_n_dev_buf,
+                                   ck_tile::DeviceMem& c_m_n_dev_buf,
+                                   ck_tile::index_t M,
+                                   ck_tile::index_t N,
+                                   ck_tile::index_t K,
+                                   ck_tile::index_t stride_A,
+                                   ck_tile::index_t stride_B,
+                                   ck_tile::index_t stride_C,
+                                   ck_tile::index_t kbatch,
+                                   int n_warmup,
+                                   int n_repeat,
+                                   bool persistent)
+{
+    // Workspace size in bytes: kbatch partial results of M * N CDataType elements each.
+    // The first factor is widened to std::size_t so the whole product is evaluated and stored
+    // as std::size_t instead of ck_tile::index_t, which large problems would overflow.
+    const std::size_t workspace_size =
+        static_cast<std::size_t>(kbatch) * M * N * sizeof(CDataType);
+    // The workspace reuses the stride of the final output matrix C.
+    const ck_tile::index_t workspace_stride = stride_C;
+
+    // Allocate the workspace and clear it before the first stage writes into it.
+    ck_tile::DeviceMem workspace_buf(workspace_size);
+    workspace_buf.SetZero();
+
+    // Create extended args for two-stage approach
+    GemmSplitKHostArgs args{
+        a_m_k_dev_buf.GetDeviceBuffer(), // a_ptr
+        b_k_n_dev_buf.GetDeviceBuffer(), // b_ptr
+        workspace_buf.GetDeviceBuffer(), // workspace_ptr (used as e_ptr for stage 1)
+        c_m_n_dev_buf.GetDeviceBuffer(), // final_output_ptr
+        kbatch,                          // k_batch
+        M,
+        N,
+        K, // dimensions
+        stride_A,
+        stride_B,         // input strides
+        workspace_stride, // workspace stride
+        stride_C          // final output stride
+    };
+
+    float ave_time = 0.f;
+    // Positional stream_config settings; n_warmup and n_repeat are forwarded from the caller.
+    // NOTE(review): meaning of the remaining positional values is defined by
+    // ck_tile::stream_config - confirm there before changing them.
+    ck_tile::stream_config config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50};
+
+    // The persistent flag is a template argument of gemm_splitk_two_stage, so the runtime value
+    // has to pick between two separate instantiations.
+    if(persistent)
+    {
+        ave_time = gemm_splitk_two_stage<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         CDataType,
+                                         ALayout,
+                                         BLayout,
+                                         DsLayout,
+                                         CLayout,
+                                         true,
+                                         CDEElementWise>(args, config);
+    }
+    else
+    {
+        ave_time = gemm_splitk_two_stage<GemmConfig,
+                                         ADataType,
+                                         BDataType,
+                                         DsDataType,
+                                         AccDataType,
+                                         CDataType,
+                                         ALayout,
+                                         BLayout,
+                                         DsLayout,
+                                         CLayout,
+                                         false,
+                                         CDEElementWise>(args, config);
+    }
+
+    // ave_time is in milliseconds, hence flop / 1e9 / ms == TFLOP/s and byte / 1e6 / ms == GB/s.
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Two-Stage GEMM+SplitK with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " kbatch=" << kbatch << " WorkspaceSize=" << workspace_size << " bytes"
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name
+              << " A_Type=" << ck_tile::DataTypeTraits<ADataType>::name
+              << " B_Type=" << ck_tile::DataTypeTraits<BDataType>::name
+              << " C_Type=" << ck_tile::DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
+              << tflops << " TFlops, " << gb_per_sec << " GB/s" << std::endl;
+
+    return ave_time;
+}
+
+// Two-stage split-K example driver: build tensors, run the kernel, optionally verify the result.
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType = ADataType,
+          typename CDataType = ADataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts_two_stage(ck_tile::ArgParser& arg_parser,
+                                            const ALayout a_layout                  = ALayout{},
+                                            const BLayout b_layout                  = BLayout{},
+                                            [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
+    // Problem sizes, strides and run options are all read from the parsed command line.
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool persistent              = arg_parser.get_int("persistent"); // nonzero => persistent
+
+    const bool preshuffle = GemmConfig::Preshuffle;
+    // Let ck_tile::get_default_stride resolve the final strides from the sizes and layouts.
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    // init: 0 = uniform random, 1 = monotonic sequence, 2 = uniform in [1, 1], otherwise zeros.
+    if(init_method == 0)
+    {
+        if constexpr(preshuffle)
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_m_k); // narrower range
+            ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_k_n);
+        }
+        else
+        {
+            ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+            ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        }
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+    }
+    else
+    {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
+    // A is adjusted for structured sparsity only when the config is not a preshuffle config.
+    if(!preshuffle && GemmConfig::UseStructuredSparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    static_assert(!GemmConfig::PermuteA, "Not implemented");
+    // Upload B: shuffled, int4-permuted, or unchanged depending on the config and BDataType.
+    if constexpr(preshuffle)
+    {
+        ck_tile::HostTensor<BDataType> b_shuffle_host = ck_tile::shuffle_b<GemmConfig>(b_k_n);
+        // shuffled buffer B for device implementation
+        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+    }
+    else
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n; // copy; b_k_n stays unpermuted
+            if constexpr(GemmConfig::PermuteB)
+            {
+                permute_tensor_b<GemmConfig,
+                                 decltype(b_k_n_dev),
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 CLayout>(b_k_n_dev);
+            }
+            permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            if constexpr(GemmConfig::PermuteB)
+            {
+                std::cout << "Permute for this DataType is not implemented." << std::endl;
+                return false; // converts to 0, the same value a failed verification returns
+            }
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+    }
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    std::cout << "Using Workspace Split-K Mode (Two-Stage with Reduction)" << std::endl;
+    // Use the new two-stage approach
+    invoke_gemm_splitk_two_stage<GemmConfig,
+                                 ADataType,
+                                 BDataType,
+                                 ck_tile::tuple<>,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 ck_tile::tuple<>,
+                                 CLayout>(a_m_k_dev_buf,
+                                          b_k_n_dev_buf,
+                                          c_m_n_dev_buf,
+                                          M,
+                                          N,
+                                          K,
+                                          stride_A,
+                                          stride_B,
+                                          stride_C,
+                                          kbatch,
+                                          n_warmup,
+                                          n_repeat,
+                                          persistent);
+    // Copy the device result back to the host; pass stays true when no verification is requested.
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+    bool pass = true;
+    // v == 1: compare against the host reference GEMM.
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        c_m_n_host_ref.SetZero();
+
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_host_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_host_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),  // relative tolerance
+                                  rtol_atol.at(ck_tile::number<1>{})); // absolute tolerance
+
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
+    }
+    else if(arg_parser.get_int("v") == 2) // v == 2: compare against the GPU reference GEMM
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Restore input for B for gpu reference
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+        if constexpr(GemmConfig::Preshuffle)
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+        // Both branches above re-upload the original host tensor b_k_n for the reference run.
+        // memory on host to store gpu reference result
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        // memory on device to store gpu reference result
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+
+        c_m_n_gpu_ref.SetZero();
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = ck_tile::check_err(c_m_n_dev_result,
+                                  c_m_n_gpu_ref,
+                                  "Error: Incorrect results!",
+                                  rtol_atol.at(ck_tile::number<0>{}),  // relative tolerance
+                                  rtol_atol.at(ck_tile::number<1>{})); // absolute tolerance
+        std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
+                  << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
+                  << std::endl;
+        std::cout << "The GPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
+    }
+
+    return pass; // bool converted to int: 1 when verification passed or was skipped, else 0
+}
+
+/**
+ * @brief Select the layout instantiation of the two-stage split-K example from layout strings.
+ *
+ * The output matrix C is always instantiated as row-major. When BPrecType is
+ * ck_tile::pk_int4_t only B = "C" is accepted; every other type accepts all four "R"/"C"
+ * combinations of A and B.
+ *
+ * @param a_layout Layout of A: "R" (row-major) or "C" (column-major)
+ * @param b_layout Layout of B: "R" (row-major) or "C" (column-major)
+ * @param arg_parser Parsed command-line arguments, forwarded to the example runner
+ * @return Value returned by run_gemm_example_with_layouts_two_stage
+ * @throws std::runtime_error for an unsupported preshuffle or layout combination
+ */
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
+{
+    using Row             = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col             = ck_tile::tensor_layout::gemm::ColumnMajor;
+    const bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    // Reject every layout pair other than A = "R", B = "C". The two inequalities must be joined
+    // with ||; joining them with && would let A = "R", B = "R" and A = "C", B = "C" through.
+    if(preshuffle && (a_layout != "R" || b_layout != "C"))
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    // Use new two-stage approach for both int4 and other data types
+    if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        // Layout template arguments are spelled out here instead of being deduced.
+        if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType,
+                                                           Row,
+                                                           Col,
+                                                           Row>(arg_parser, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType,
+                                                           Col,
+                                                           Col,
+                                                           Row>(arg_parser, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                     "BPrecType is ck_tile::pk_int4_t!");
+        }
+    }
+    else
+    {
+        // Layout types are deduced from the tag objects passed as function arguments.
+        if(a_layout == "R" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                arg_parser, Row{}, Row{}, Row{});
+        }
+        else if(a_layout == "R" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                arg_parser, Row{}, Col{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "R")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                arg_parser, Col{}, Row{}, Row{});
+        }
+        else if(a_layout == "C" && b_layout == "C")
+        {
+            return run_gemm_example_with_layouts_two_stage<GemmConfig,
+                                                           APrecType,
+                                                           BPrecType,
+                                                           CPrecType>(
+                arg_parser, Col{}, Col{}, Row{});
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported memory layout for the input matrices!");
+        }
+    }
+    // Not reached: every branch above returns or throws.
+    return 0;
+}
+
+/**
+ * @brief Dispatch the example on the runtime precision string (the "prec" argument).
+ *
+ * Recognised values are "fp16", "bf16", "fp8", "bf8", "int8" and "pk_int4_t". Each branch
+ * instantiates GemmConfig with the element type of matrix A for that precision.
+ *
+ * @tparam GemmConfig Configuration template, instantiated with the selected precision type
+ * @param arg_parser Parsed command-line arguments ("prec", "a_layout", "b_layout" are read here)
+ * @return Value returned by run_gemm_example_prec_type
+ * @throws std::runtime_error for an unknown precision, or for "pk_int4_t" when the pipeline of
+ *         GemmConfig<ck_tile::half_t> is not COMPUTE_V3
+ */
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf16")
+    {
+        // Instantiate the config with bf16_t, matching how every other branch passes its own
+        // element type (this branch previously passed half_t).
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "fp8")
+    {
+        // 8-bit float inputs, half-precision output.
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "int8")
+    {
+        // 8-bit integer inputs, 32-bit integer output.
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          ck_tile::int32_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "pk_int4_t")
+    {
+        // TODO: Add support for bhalf_t ADataType
+        if constexpr(GemmConfig<ck_tile::half_t>::Pipeline == ck_tile::GemmPipeline::COMPUTE_V3)
+        {
+            // Half-precision A and C with packed int4 B.
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>,
+                                              ck_tile::half_t,
+                                              ck_tile::pk_int4_t,
+                                              ck_tile::half_t>(a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    else
+    {
+        throw std::runtime_error("Unsupported data type for this operation !!!");
+    }
+    // Not reached: every branch above returns or throws.
+    return 0;
+}
+
+// Entry point: parse the command line, run the example and map its result to an exit code.
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+
+    // Stop early when the command line cannot be parsed.
+    if(!arg_parser.parse(argc, argv))
+    {
+        return -1;
+    }
+
+    try
+    {
+        // The example returns nonzero on success, so the value is negated into an exit code.
+#if CK_TILE_USE_WMMA
+        const auto status = run_gemm_example<GemmConfigComputeV3_WMMA>(arg_parser);
+#else
+        const auto status = run_gemm_example<GemmConfigComputeV3>(arg_parser);
+#endif
+        return !status;
+    }
+    catch(const std::runtime_error& error)
+    {
+        // Report the problem and signal failure through the exit code.
+        std::cerr << "Caught runtime error: " << error.what() << '\n';
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -0,0 +1,516 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <string>
+#include <variant>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/core/numeric/pk_fp4.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/gemm.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+struct GemmConfigBase // defaults inherited by the GemmConfig* structs, which shadow some of them
+{
+    static constexpr bool kPadM = false; // all three padding flags default to off
+    static constexpr bool kPadN = false;
+    static constexpr bool kPadK = false;
+    // Permutation of A and B is off by default.
+    static constexpr bool PermuteA = false;
+    static constexpr bool PermuteB = false;
+
+    static constexpr bool TransposeC            = false;
+    static constexpr bool UseStructuredSparsity = false;
+
+    static constexpr int kBlockPerCu                         = 1;
+    static constexpr ck_tile::index_t TileParitionerGroupNum = 8; // spelling is part of the name
+    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;
+    static constexpr bool Preshuffle                = false;
+    static constexpr bool TiledMMAPermuteN          = false;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryInterwave : public GemmConfigBase
+{
+    // Memory friendly for Interwave scheduler
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // 128 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16; // 8 if 2-byte
+    // Pipeline and Scheduler below shadow the GemmConfigBase defaults.
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Interwave;
+};
+
+template <typename PrecType>
+struct GemmConfigMemoryIntrawave : public GemmConfigBase
+{ // MEMORY pipeline; Scheduler is not re-declared, so the Intrawave base default applies
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 32;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // 128 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16; // 8 if 2-byte
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::MEMORY;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3 : public GemmConfigBase
+{
+    // Compute V3 only supports the Intrawave scheduler
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType); // 256 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_1 : public GemmConfigBase
+{ // COMPUTE_V3 variant with a 256 x 256 tile and 32 x 32 warp tiles
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // 128 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_2 : public GemmConfigBase
+{ // COMPUTE_V3 variant with a 128 x 128 tile and 16 x 16 warp tiles
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // 128 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2; // shadows the GemmConfigBase default of 1
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV3_WMMA : public GemmConfigBase
+{ // COMPUTE_V3 variant with fixed 16 x 16 x 16 warp tiles
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType); // 64 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 4;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16; // constant, independent of PrecType
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
+
+    static constexpr int kBlockPerCu = 2; // shadows the GemmConfigBase default of 1
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4 : public GemmConfigBase
+{
+    // Compute V4 only supports the Intrawave scheduler
+    // Using the ping pong reader in the lds level
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType); // 64 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = true; // enabled for this pipeline
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV4_1 : public GemmConfigBase
+{ // Same as GemmConfigComputeV4 except that K_Tile spans 128 bytes instead of 64
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // 128 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = true; // enabled for this pipeline
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V4;
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV5 : public GemmConfigBase
+{ // COMPUTE_V5 pipeline; the only config in this group with K_Warp and NumWaveGroups set to 2
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 64 / sizeof(PrecType); // 64 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 1;
+    static constexpr ck_tile::index_t K_Warp = 2;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = // chosen by ck_tile from PrecType/M_Warp_Tile
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile>();
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V5;
+    static constexpr ck_tile::index_t NumWaveGroups = 2; // shadows the GemmConfigBase default of 1
+};
+
+template <typename PrecType>
+struct GemmConfigComputeV6 : public GemmConfigBase
+{ // COMPUTE_V6 pipeline; every tile size here is a constant that does not depend on PrecType
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 32;
+
+    static constexpr ck_tile::index_t M_Warp = 2;
+    static constexpr ck_tile::index_t N_Warp = 2;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V6;
+    static constexpr ck_tile::index_t NumWaveGroups = 1; // same value as the GemmConfigBase default
+};
+
+template <typename PrecType>
+struct GemmConfigComputeAsync : public GemmConfigBase
+{ // COMPUTE_ASYNC pipeline; every tile size here is a constant that does not depend on PrecType
+    static constexpr ck_tile::index_t M_Tile = 64;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256;
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 128;
+
+    static constexpr bool DoubleSmemBuffer          = true;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_ASYNC;
+    static constexpr ck_tile::index_t NumWaveGroups = 1;     // same as the GemmConfigBase default
+    static constexpr bool UseStructuredSparsity     = false; // same as the GemmConfigBase default
+};
+
+template <typename PrecType>
+struct GemmConfigPreshuffleDecode : public GemmConfigBase
+{ // PRESHUFFLE_V2 pipeline with Preshuffle enabled and a 16 x 64 tile
+    static constexpr ck_tile::index_t M_Tile = 16;
+    static constexpr ck_tile::index_t N_Tile = 64;
+    static constexpr ck_tile::index_t K_Tile = 256 / sizeof(PrecType); // 256 bytes of PrecType
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = // note the extra `true` template argument
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile, true>();
+
+    static constexpr int kBlockPerCu                = 1;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::PRESHUFFLE_V2;
+    static constexpr bool Preshuffle                = true;
+    static constexpr bool DoubleSmemBuffer          = true;
+    static constexpr int N_Repeat                   = N_Tile / N_Warp_Tile / N_Warp; // 64/16/4 = 1
+    static constexpr bool TiledMMAPermuteN          = N_Repeat % 2 == 0; // false: N_Repeat is odd
+};
+
+template <typename PrecType> // PrecType: element type that sizes K_Tile and K_Warp_Tile below
+struct GemmConfigPreshufflePrefill : public GemmConfigBase
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(PrecType); // i.e. 128 bytes of K
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile =
+        ck_tile::get_k_warp_tile<PrecType, M_Warp_Tile, true>();
+    // Launch/pipeline knobs; TiledMMAPermuteN is derived, so it tracks the tile constants above.
+    static constexpr int kBlockPerCu                = 2;
+    static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Default;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::PRESHUFFLE_V2;
+    static constexpr bool Preshuffle                = true;
+    static constexpr bool DoubleSmemBuffer          = true;
+    static constexpr int N_Repeat                   = N_Tile / N_Warp_Tile / N_Warp; // 128/16/4 = 2
+    static constexpr bool TiledMMAPermuteN          = N_Repeat % 2 == 0; // true: N_Repeat is even
+};
+
+template <typename PrecType> // used by gemm_weight_preshuffle.cpp when CK_TILE_USE_WMMA is set
+struct GemmConfigPreshufflePrefill_Wmma : public GemmConfigPreshufflePrefill<PrecType>
+{
+    static constexpr ck_tile::index_t M_Warp_Tile = 16; // same value as the base config
+    static constexpr ck_tile::index_t N_Warp_Tile = 16; // same value as the base config
+    static constexpr ck_tile::index_t K_Warp_Tile = 16; // fixed 16 instead of get_k_warp_tile()
+};
+
+template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
+struct GemmTypeConfig; // declared only: an (A, B, C) combination not listed below has no definition
+
+template <> // matches <half_t> with defaulted B and C, i.e. (half_t, half_t, half_t)
+struct GemmTypeConfig<ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+    // ToDo: Add more bias config to support different categories of GEMM.
+};
+
+template <> // bf16 inputs and output, float accumulator
+struct GemmTypeConfig<ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t>
+{
+    using ADataType   = ck_tile::bf16_t;
+    using BDataType   = ck_tile::bf16_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::bf16_t;
+};
+
+template <> // fp8 inputs, half_t output, float accumulator
+struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::fp8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <> // bf8 inputs, half_t output, float accumulator
+struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::bf8_t;
+    using BDataType   = ck_tile::bf8_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <> // mixed inputs: fp8 A with pk_int4_t B, half_t output
+struct GemmTypeConfig<ck_tile::fp8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::fp8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <> // mixed inputs: bf8 A with pk_int4_t B, half_t output
+struct GemmTypeConfig<ck_tile::bf8_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::bf8_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <> // mixed inputs: half_t A with pk_int4_t B, half_t output
+struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <> // int8 inputs; the only combination here that accumulates and outputs in int32_t
+struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
+{
+    using ADataType   = ck_tile::int8_t;
+    using BDataType   = ck_tile::int8_t;
+    using AccDataType = int32_t;
+    using CDataType   = int32_t;
+};
+
+template <> // pk_fp4_t inputs, half_t output, float accumulator
+struct GemmTypeConfig<ck_tile::pk_fp4_t, ck_tile::pk_fp4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::pk_fp4_t;
+    using BDataType   = ck_tile::pk_fp4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
+template <ck_tile::GemmPipeline PipelineId>
+struct PipelineTypeTraits; // declared only; each supported GemmPipeline value is specialized below
+// Each specialization exposes GemmPipeline<Problem> and UniversalGemmPipeline<Problem> aliases.
+template <>
+struct PipelineTypeTraits<ck_tile::GemmPipeline::MEMORY>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V3>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V4>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V5>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
+};
+
+template <>
+struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V6>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV6<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV6<PipelineProblem>;
+};
+
+template <> // the pipeline value chosen by GemmConfigComputeAsync
+struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_ASYNC>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompAsync<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompAsync<PipelineProblem>;
+};
+
+template <> // the pipeline value chosen by the GemmConfigPreshuffle* configs
+struct PipelineTypeTraits<ck_tile::GemmPipeline::PRESHUFFLE_V2>
+{
+    template <typename PipelineProblem>
+    using GemmPipeline = ck_tile::WeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+    template <typename PipelineProblem>
+    using UniversalGemmPipeline =
+        ck_tile::BaseWeightPreshufflePipelineAGmemBGmemCRegV2<PipelineProblem>;
+};
+
+inline auto create_args()
+{
+    ck_tile::ArgParser cli; // each insert() registers (option name, default value, help text)
+    cli.insert("m", "3840", "m dimension");
+    cli.insert("n", "4096", "n dimension");
+    cli.insert("k", "2048", "k dimension");
+    cli.insert("a_layout", "R", "A tensor data layout - Row by default");
+    cli.insert("b_layout", "C", "B tensor data layout - Column by default");
+    cli.insert("c_layout", "R", "C tensor data layout - Row by default");
+    cli.insert("stride_a", "0", "Tensor A stride");
+    cli.insert("stride_b", "0", "Tensor B stride");
+    cli.insert("stride_c", "0", "Tensor C stride");
+    cli.insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU");
+    cli.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8/pk_int4_t");
+    cli.insert("warmup", "50", "number of iterations before benchmark the kernel");
+    cli.insert("repeat", "100", "number of iterations to benchmark the kernel");
+    cli.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer");
+    cli.insert("split_k", "1", "splitK value");
+    cli.insert("init", "0", "0:random, 1:linear, 2:constant(1)");
+    cli.insert("persistent", "0", "0:non-persistent, 1:persistent");
+    cli.insert("json", "0", "0: No Json, 1: Dump Results in Json format");
+    cli.insert("jsonfile", "gemm.json", "json file name to dump results");
+    cli.insert("flush_cache", "true", "flush cache before running the kernel, defaults to true");
+    cli.insert("rotating_count", "1000", "rotating count, defaults to 1000");
+    cli.insert("test_async", "0", "0: normal gemm, 1: test async input scheduler");
+    return cli;
+}
+
+// host API (declaration only): presumably runs the GEMM described by args on stream config s.
+template <typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          bool Persistent = false,
+          typename CDEElementWise> // no default and not deducible: callers must spell it out
+float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s);
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle.cpp
@@ -0,0 +1,112 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+#include "ck_tile/host.hpp"
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+#include "gemm_weight_preshuffle_invoker.hpp"
+
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
+{
+    using RowMajor = ck_tile::tensor_layout::gemm::RowMajor;
+    using ColMajor = ck_tile::tensor_layout::gemm::ColumnMajor;
+    // Only A row-major with B column-major is wired up; anything else is rejected up front. The
+    // message depends on whether the chosen config preshuffles B.
+    const bool b_is_preshuffled = GemmConfig::Preshuffle;
+    const bool is_row_col       = (a_layout == "R") && (b_layout == "C");
+    if(!is_row_col)
+    {
+        if(b_is_preshuffled)
+        {
+            throw std::runtime_error(
+                "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+        }
+        throw std::runtime_error("Unsupported memory layout for the input matrices!");
+    }
+    return run_gemm_example_with_layouts<GemmConfig,
+                                         WeightPreshuffleInvoker,
+                                         APrecType,
+                                         BPrecType,
+                                         CPrecType>(arg_parser, RowMajor{}, ColMajor{}, RowMajor{});
+}
+
+// Reads the prec/a_layout/b_layout options and forwards to run_gemm_example_prec_type with the
+// matching element types. GemmConfig is instantiated with the A element type of each branch.
+// Throws std::runtime_error when the prec value has no branch below.
+template <template <typename PreType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    std::string a_layout  = arg_parser.get_str("a_layout");
+    std::string b_layout  = arg_parser.get_str("b_layout");
+
+    if(data_type == "fp16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf16")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "fp8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::fp8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "bf8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
+                                          ck_tile::bf8_t,
+                                          ck_tile::bf8_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    else if(data_type == "int4" || data_type == "pk_int4_t") // help text spells it pk_int4_t
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
+                                          ck_tile::fp8_t,
+                                          ck_tile::pk_int4_t,
+                                          ck_tile::half_t>(a_layout, b_layout, arg_parser);
+    }
+    throw std::runtime_error("Unsupported data type for this operation !!!");
+}
+
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+    // A command line that fails to parse ends the program with -1 before any GEMM is set up.
+    if(auto parsed = arg_parser.parse(argc, argv); !parsed)
+        return -1;
+    try
+    {
+        // Presumably the runner returns non-zero on success; negating it yields exit code 0.
+#if CK_TILE_USE_WMMA
+        const int status = run_gemm_example<GemmConfigPreshufflePrefill_Wmma>(arg_parser);
+#else
+        const int status = run_gemm_example<GemmConfigPreshufflePrefill>(arg_parser);
+#endif
+        return !status;
+    }
+    catch(const std::runtime_error& failure)
+    {
+        std::cerr << "Caught runtime error: " << failure.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/gemm_weight_preshuffle_invoker.hpp
+++ b/example/ck_tile/03_gemm/gemm_weight_preshuffle_invoker.hpp
@@ -0,0 +1,151 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include "gemm_utils.hpp"
+
+struct WeightPreshuffleInvoker // stateless holder for the weight-preshuffle GEMM launch routine
+{
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+    // Assembles the kernel type from GemmConfig, checks args, launches, and returns ave_time.
+    {
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             Persistent,
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle>;
+        constexpr auto scheduler = GemmConfig::Scheduler;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler>;
+        // GemmConfig::Pipeline picks the PipelineTypeTraits specialization that names the pipeline.
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             GemmConfig::NumWaveGroups,
+                                             false,
+                                             1,
+                                             GemmConfig::TiledMMAPermuteN>>;
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        auto kargs   = Kernel::MakeKernelArgs(args);
+        // Persistent kernels size the grid via MaxOccupancyGridSize; others from M, N and k_batch.
+        dim3 grids;
+        if constexpr(Persistent)
+        {
+            grids = Kernel::MaxOccupancyGridSize(s);
+        }
+        else
+        {
+            grids = Kernel::GridSize(args.M, args.N, args.k_batch);
+        }
+        dim3 blocks = Kernel::BlockSize();
+        // Unsupported arguments are rejected before anything is launched.
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << ", kBlockPerCu: {" << GemmConfig::kBlockPerCu << "}" << std::endl;
+        }
+        float ave_time = 0.f;
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+            // Host tensors are built only to obtain the byte sizes of the A and B buffers.
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            ck_tile::RotatingMemWrapper<ADataType, BDataType> rotating_mem(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem.Print();
+            // Handed to launch_kernel_time_mask below; presumably run before each timed launch.
+            auto run_flush_cache = [&]() {
+                // flush icache
+                ck_tile::flush_icache();
+                // rotating mem
+                rotating_mem.Next();
+                // clear c mem (sizeof comes first so the byte count is computed in std::size_t)
+                if(args.k_batch > 1)
+                    hipGetErrorString(hipMemsetAsync(
+                        args.e_ptr, 0, sizeof(CDataType) * args.M * args.N, s.stream_id_));
+            };
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                run_flush_cache,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        else
+        {
+            ave_time = ck_tile::launch_kernel(
+                s,
+                ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+        }
+        return ave_time;
+    }
+};
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -0,0 +1,466 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include "ck_tile/host/permute_pk_int4.hpp"
+#include "ck_tile/host/tensor_shuffle_utils.hpp"
+#include "ck_tile/ops/common/utils.hpp"
+
+template <typename Layout>
+static constexpr inline auto is_row_major(Layout /* only the argument's type is inspected */)
+{
+    using RowMajor = ck_tile::tensor_layout::gemm::RowMajor; // the layout tag being tested for
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<Layout>, RowMajor>>{};
+}
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+auto calculate_rtol_atol(const ck_tile::index_t K,
+                         const ck_tile::index_t kbatch,
+                         const float max_accumulated_value)
+{
+    // Tolerances are derived for the input type with the smaller sizeof (B when sizes tie).
+    using NarrowInput =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
+    const auto k_per_slice = ck_tile::integer_divide_ceil(K, kbatch); // K per split-K slice
+    const auto slice_rtol =
+        ck_tile::get_relative_threshold<NarrowInput, CDataType, AccDataType>(k_per_slice);
+    const auto slice_atol = ck_tile::get_absolute_threshold<NarrowInput, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, k_per_slice);
+    // Error budget for summing the kbatch partial results, evaluated entirely in CDataType.
+    const auto sum_rtol = ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto sum_atol = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch);
+    return ck_tile::make_tuple(std::max(slice_rtol, sum_rtol), std::max(slice_atol, sum_atol));
+}
+
+template <typename GemmConfig,
+          typename Tensor,
+          typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+void permute_tensor_b(Tensor& tensor) // in place: `tensor` is rewritten from a copy of itself
+{
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+        ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+        ck_tile::
+            sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+        GemmConfig::PermuteA,
+        GemmConfig::PermuteB>;
+    // The shape/traits/problem types exist only to name GemmPipeline, which supplies K1 below.
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                                                 GemmConfig::kPadN,
+                                                                 GemmConfig::kPadK,
+                                                                 GemmConfig::DoubleSmemBuffer,
+                                                                 ALayout,
+                                                                 BLayout,
+                                                                 CLayout,
+                                                                 GemmConfig::TransposeC,
+                                                                 GemmConfig::UseStructuredSparsity>;
+
+    using UniversalGemmProblem =
+        ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                              BDataType,
+                                              AccDataType,
+                                              GemmShape,
+                                              GemmUniversalTraits,
+                                              GemmConfig::Scheduler,
+                                              ck_tile::element_wise::PassThrough,
+                                              ck_tile::element_wise::PassThrough,
+                                              ADataType,
+                                              true>;
+
+    using GemmPipeline = typename PipelineTypeTraits<GemmConfig::Pipeline>::template GemmPipeline<
+        UniversalGemmProblem>;
+    // K and N are the tensor's dimension 0 and 1 lengths; K1 is the pipeline's GetSmemPackB().
+    const ck_tile::index_t K  = tensor.get_length(0);
+    const ck_tile::index_t N  = tensor.get_length(1);
+    const ck_tile::index_t K1 = GemmPipeline::GetSmemPackB();
+    const ck_tile::index_t K0 = K / K1; // integer division: k >= K0 * K1 is never visited
+
+    Tensor tensor_copy = tensor; // unmodified snapshot to read from while `tensor` is overwritten
+
+    // Destination is indexed as [K0][N][K1]; the source is read at flat offset i * K + j * K1 + jj.
+    for(int j = 0; j < K0; j++)
+    {
+        for(int i = 0; i < N; i++)
+        {
+            for(int jj = 0; jj < K1; jj++)
+            {
+                tensor(j * N * K1 + i * K1 + jj) = tensor_copy(i * K + (j * K1 + jj));
+            }
+        }
+    }
+}
+
+template <typename GemmConfig,
+          typename Invoker,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename CDEElementWise = ck_tile::element_wise::PassThrough>
+float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
+                  ck_tile::DeviceMem& b_k_n_dev_buf,
+                  ck_tile::DeviceMem& c_m_n_dev_buf,
+                  ck_tile::index_t M,
+                  ck_tile::index_t N,
+                  ck_tile::index_t K,
+                  ck_tile::index_t stride_A,
+                  ck_tile::index_t stride_B,
+                  ck_tile::index_t stride_C,
+                  ck_tile::index_t kbatch,
+                  int n_warmup,
+                  int n_repeat,
+                  bool persistent,
+                  bool flush_cache,
+                  int rotating_count)
+{
+    // Wraps the three device buffers and the problem description into GemmHostArgs and hands
+    // them to Invoker::gemm. The runtime `persistent` flag has to become a template argument,
+    // so both values are instantiated and the matching one is called. The invoker's float
+    // result is returned unchanged.
+    const ck_tile::GemmHostArgs host_args = {a_m_k_dev_buf.GetDeviceBuffer(),
+                                             b_k_n_dev_buf.GetDeviceBuffer(),
+                                             c_m_n_dev_buf.GetDeviceBuffer(),
+                                             kbatch,
+                                             M,
+                                             N,
+                                             K,
+                                             stride_A,
+                                             stride_B,
+                                             stride_C};
+
+    // One stream configuration shared by both instantiations; it carries the warmup/repeat
+    // counts, the cache-flush switch and the rotating count given by the caller.
+    const ck_tile::stream_config stream_cfg{
+        nullptr, true, 1, n_warmup, n_repeat, true, flush_cache, rotating_count};
+
+    // Non-persistent instantiation.
+    if(!persistent)
+    {
+        return Invoker::template gemm<GemmConfig,
+                                      ADataType,
+                                      BDataType,
+                                      DsDataType,
+                                      AccDataType,
+                                      CDataType,
+                                      ALayout,
+                                      BLayout,
+                                      DsLayout,
+                                      CLayout,
+                                      false,
+                                      CDEElementWise>(host_args, stream_cfg);
+    }
+
+    // Persistent instantiation.
+    return Invoker::template gemm<GemmConfig,
+                                  ADataType,
+                                  BDataType,
+                                  DsDataType,
+                                  AccDataType,
+                                  CDataType,
+                                  ALayout,
+                                  BLayout,
+                                  DsLayout,
+                                  CLayout,
+                                  true,
+                                  CDEElementWise>(host_args, stream_cfg);
+}
+
+template <typename CDataType>
+bool do_verify(const ck_tile::HostTensor<CDataType>& c_m_n_dev_result,
+               const ck_tile::HostTensor<CDataType>& c_m_n_ref,
+               const ck_tile::tuple<double, double>& rtol_atol,
+               const char* variant)
+{
+    // rtol_atol carries the relative tolerance at index 0 and the absolute one at index 1.
+    const double rel_tol = rtol_atol.at(ck_tile::number<0>{});
+    const double abs_tol = rtol_atol.at(ck_tile::number<1>{});
+    const bool ok        = ck_tile::check_err(
+        c_m_n_dev_result, c_m_n_ref, "Error: Incorrect results!", rel_tol, abs_tol);
+    // The thresholds and the verdict are reported whether or not the comparison passed.
+    std::cout << "Relative error threshold: " << rel_tol
+              << " Absolute error threshold: " << abs_tol << std::endl;
+    std::cout << "The " << variant << " verification result is:" << (ok ? "correct" : "fail")
+              << std::endl;
+    return ok;
+}
+
+std::tuple<ck_tile::index_t, ck_tile::index_t, ck_tile::index_t> inline parse_gemm_size(
+    ck_tile::ArgParser& arg_parser)
+{
+    // Returns the "m", "n" and "k" options as (M, N, K), each converted to ck_tile::index_t.
+    // The braced return evaluates the three lookups left to right, matching the original order.
+    const auto dim = [&](const char* key) -> ck_tile::index_t { return arg_parser.get_int(key); };
+    return {dim("m"), dim("n"), dim("k")};
+}
+
+template <typename GemmConfig,
+          typename Invoker,
+          typename ADataType,
+          typename BDataType = ADataType,
+          typename CDataType = ADataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
+                                  const ALayout a_layout                  = ALayout{},
+                                  const BLayout b_layout                  = BLayout{},
+                                  [[maybe_unused]] const CLayout c_layout = CLayout{})
+{
+    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
+
+    ck_tile::index_t M = arg_parser.get_int("m");
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t K = arg_parser.get_int("k");
+
+    ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
+    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
+    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
+
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
+    bool persistent              = arg_parser.get_int("persistent");
+    bool flush_cache             = arg_parser.get_bool("flush_cache");
+    int rotating_count           = arg_parser.get_int("rotating_count");
+
+    const bool preshuffle = GemmConfig::Preshuffle;
+
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));
+
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-2.f, 2.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-2.f, 2.f}(b_k_n);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_k_n);
+    }
+    else
+    {
+        a_m_k.SetZero();
+        b_k_n.SetZero();
+    }
+
+    if(!preshuffle && GemmConfig::UseStructuredSparsity)
+    {
+        ck_tile::AdjustToStructuredSparsity<ADataType>{}(a_m_k);
+    }
+
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    static_assert(!GemmConfig::PermuteA, "Not implemented");
+
+    if constexpr(preshuffle)
+    {
+        ck_tile::HostTensor<BDataType> b_shuffle_host = [&]() {
+            if constexpr(GemmConfig::TiledMMAPermuteN)
+            {
+                std::cout << "Run with PermuteN" << std::endl;
+                return ck_tile::shuffle_b_permuteN<GemmConfig>(b_k_n);
+            }
+            else
+            {
+                std::cout << "Run without PermuteN" << std::endl;
+                return ck_tile::shuffle_b<GemmConfig>(b_k_n);
+            }
+        }();
+        // shuffled buffer B for device implementation
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            ck_tile::permute_vectors_i4x4_b(b_shuffle_host);
+        }
+        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
+    }
+    else
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Permute vector pk_i4x4 data for device implementation
+            ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+            if constexpr(GemmConfig::PermuteB)
+            {
+                permute_tensor_b<GemmConfig,
+                                 decltype(b_k_n_dev),
+                                 ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 CLayout>(b_k_n_dev);
+            }
+            ck_tile::permute_vectors_i4x4_b(b_k_n_dev);
+            b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+        }
+        else
+        {
+            if constexpr(GemmConfig::PermuteB)
+            {
+                std::cout << "Permute for this DataType is not implemented." << std::endl;
+                return false;
+            }
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+    }
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    float ave_time = invoke_gemm<GemmConfig,
+                                 Invoker,
+                                 ADataType,
+                                 BDataType,
+                                 ck_tile::tuple<>,
+                                 AccDataType,
+                                 CDataType,
+                                 ALayout,
+                                 BLayout,
+                                 ck_tile::tuple<>,
+                                 CLayout>(a_m_k_dev_buf,
+                                          b_k_n_dev_buf,
+                                          c_m_n_dev_buf,
+                                          M,
+                                          N,
+                                          K,
+                                          stride_A,
+                                          stride_B,
+                                          stride_C,
+                                          kbatch,
+                                          n_warmup,
+                                          n_repeat,
+                                          persistent,
+                                          flush_cache,
+                                          rotating_count);
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+    std::size_t flop = std::size_t(2) * M * N * K;
+    std::size_t num_byte =
+        sizeof(ADataType) * M * K / ck_tile::numeric_traits<ADataType>::PackedSize +
+        sizeof(BDataType) * N * K / ck_tile::numeric_traits<BDataType>::PackedSize +
+        sizeof(CDataType) * M * N;
+    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+
+    std::cout << "Run Gemm kernel with M=" << M << " N=" << N << " K=" << K
+              << " StrideA=" << stride_A << " StrideB=" << stride_B << " StrideC=" << stride_C
+              << " A_Layout=" << ALayout::name << " B_Layout =" << BLayout::name
+              << " C_Layout=" << CLayout::name
+              << " A_Type=" << ck_tile::DataTypeTraits<ADataType>::name
+              << " B_Type=" << ck_tile::DataTypeTraits<BDataType>::name
+              << " C_Type=" << ck_tile::DataTypeTraits<CDataType>::name
+              << " StructuredSparsity=" << (GemmConfig::UseStructuredSparsity ? "on" : "off")
+              << " Persistent=" << (persistent ? "on" : "off") << " : " << ave_time << " ms, "
+              << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
+
+    bool pass = true;
+
+    // memory on host to store gpu reference result
+    ck_tile::HostTensor<CDataType> c_m_n_ref(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+    c_m_n_ref.SetZero();
+
+    if(arg_parser.get_int("v") == 1)
+    {
+        ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_m_k, b_k_n, c_m_n_ref);
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
+    }
+    else if(arg_parser.get_int("v") == 2)
+    {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Restore input for B for gpu reference
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+        if constexpr(GemmConfig::Preshuffle)
+        {
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
+
+        // memory on device to store gpu reference result
+        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_ref.get_element_space_size_in_bytes());
+        c_m_n_gpu_buf_ref.SetZero();
+
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());
+
+        ck_tile::reference_gemm_gpu<ADataType,
+                                    BDataType,
+                                    AccDataType,
+                                    CDataType,
+                                    ALayout,
+                                    BLayout,
+                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);
+
+        c_m_n_gpu_buf_ref.FromDevice(c_m_n_ref.data());
+
+        const float max_accumulated_value =
+            *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+            K, kbatch, max_accumulated_value);
+        pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "GPU");
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_gemm_json_results<ALayout,
+                               BLayout,
+                               CLayout,
+                               ADataType,
+                               BDataType,
+                               CDataType,
+                               GemmConfig,
+                               ck_tile::DataTypeTraits>(arg_parser.get_str("jsonfile"),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        stride_A,
+                                                        stride_B,
+                                                        stride_C,
+                                                        persistent,
+                                                        pass,
+                                                        ave_time,
+                                                        tflops,
+                                                        gb_per_sec);
+    }
+
+    return pass;
+}
--- a/example/ck_tile/03_gemm/run_gemm_example_common.hpp
+++ b/example/ck_tile/03_gemm/run_gemm_example_common.hpp
@@ -0,0 +1,63 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include "gemm_utils.hpp"
+
+// Maps the layout strings ("R" -> RowMajor, "C" -> ColumnMajor) to tag types and forwards to
+// run_gemm_example_with_layouts with a row-major C. Throws std::runtime_error for an unknown
+// layout string or an unsupported preshuffle / pk_int4_t combination.
+template <typename GemmConfig,
+          typename Invoker,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type(std::string a_layout,
+                               std::string b_layout,
+                               ck_tile::ArgParser& arg_parser)
+{
+    using Row             = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col             = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using LayoutVariant   = std::variant<Row, Col>;
+    const bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    // Preshuffle needs A == "R" and B == "C"; reject as soon as either one differs.
+    if(preshuffle && (a_layout != "R" || b_layout != "C"))
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    auto string_to_layout = [](const std::string& layout) -> LayoutVariant {
+        if(layout == "R")
+            return Row{};
+        if(layout == "C")
+            return Col{};
+        throw std::runtime_error("Unsupported layout: " + layout);
+    };
+
+    auto a_layout_variant = string_to_layout(a_layout);
+    auto b_layout_variant = string_to_layout(b_layout);
+
+    return std::visit(
+        [&](auto a_layout_type, auto b_layout_type) -> int {
+            if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t> &&
+                         std::is_same_v<decltype(b_layout_type), Row>)
+            {
+                throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                         "BPrecType is ck_tile::pk_int4_t!");
+            }
+            else
+            {
+                return run_gemm_example_with_layouts<GemmConfig, Invoker, APrecType, BPrecType,
+                                                     CPrecType>(
+                    arg_parser, a_layout_type, b_layout_type, Row{});
+            }
+        },
+        a_layout_variant,
+        b_layout_variant);
+}
--- a/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_bf16.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" # first match under the current directory
+# Stop early with a clear message instead of failing with "not found" in every iteration.
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_basic not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                "$EXE" -prec=bf16 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_bf8.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" # first match under the current directory
+# Stop early with a clear message instead of failing with "not found" in every iteration.
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_basic not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                "$EXE" -prec=bf8 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_basic_fp16.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_fp16.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" # first match under the current directory
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_basic not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                "$EXE" -prec=fp16 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_basic_fp8.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_basic_fp8.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)" # first match under the current directory
+# Stop early with a clear message instead of failing with "not found" in every iteration.
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_basic not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "64" "512" "1024" "2048"; do
+        for n in "512" "1024" "2048"; do
+            for k in "64" "512" "1024" "2048"; do
+                "$EXE" -prec=fp8 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf16.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf16.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" # first match under the current directory
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_universal not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "512" "1024" "2048" "4096"; do
+        for n in "512" "1024" "2048"; do
+            for k in "512" "1024" "2048"; do
+                "$EXE" -prec=bf16 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf8.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_bf8.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" # first match under the current directory
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_universal not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "512" "1024" "2048" "4096"; do
+        for n in "512" "1024" "2048"; do
+            for k in "512" "1024" "2048"; do
+                "$EXE" -prec=bf8 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp16.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp16.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" # first match under the current directory
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_universal not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "512" "1024" "2048" "4096"; do
+        for n in "512" "1024" "2048"; do
+            for k in "512" "1024" "2048"; do
+                "$EXE" -prec=fp16 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp8.sh
+++ b/example/ck_tile/03_gemm/script/benchmark_mem_pipeline_fp8.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)" # first match under the current directory
+[ -n "$EXE" ] || { echo "error: tile_example_gemm_universal not found under $PWD" >&2; exit 1; }
+VALID=1 # forwarded to the binary as -v
+for b_matrix_layout in "C"; do
+    for m in "512" "1024" "2048" "4096"; do
+        for n in "512" "1024" "2048"; do
+            for k in "512" "1024" "2048"; do
+                "$EXE" -prec=fp8 -m="$m" -n="$n" -k="$k" -a_layout="R" -b_layout="$b_matrix_layout" -c_layout="R" -v="$VALID"
+            done
+        done
+    done
+done
--- a/example/ck_tile/03_gemm/script/run_full_test.sh
+++ b/example/ck_tile/03_gemm/script/run_full_test.sh
@@ -0,0 +1,48 @@
+#!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+#
+# in order to run this script you'd first need to build the tile_example_gemm executables in ../build/bin/
+#
+# run the script as "./run_full_test.sh <tag for your test environment> <branch name> <host name> <gpu_arch>"
+# input arguments: 
+# environment tag  : a string describing the specifics of your test environment
+# branch name      : name of the branch in git repo (git status | grep -e 'On branch')
+# host name        : $hostname
+# gpu architecture: e.g., gfx90a, or gfx942, etc.
+
+# get the command line arguments:
+export env_type=$1
+echo 'Environment type: ' $env_type
+export branch=$2
+echo 'Branch name: ' $branch
+export host_name=$3
+echo 'Host name: ' $host_name
+export GPU_arch=$4
+echo 'GPU_arch: ' $GPU_arch
+
+function print_log_header(){
+    rm -f $1;
+    echo 'On branch ' $3 &> $1;
+    echo 'Node name: ' $4 >> $1;
+    # get GPU architecture and compute units from rocminfo
+    echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+    rocminfo | grep "Compute Unit:" >> $1;
+    hipcc --version | grep -e 'HIP version'  >> $1;
+    echo 'Environment type: ' $2 >> $1;
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+
+# run verification tests
+for dtype in fp16 bf16 fp8 bf8; do
+    example/ck_tile/03_gemm/script/benchmark_basic_$dtype.sh
+done
+example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
+
+# run performance benchmarks
+for dtype in fp16 bf16 fp8 bf8; do
+    export gemm_log="perf_tile_gemm_mem_pipeline_${dtype}_${GPU_arch}.log"
+    print_log_header $gemm_log $env_type $branch $host_name
+    example/ck_tile/03_gemm/script/benchmark_mem_pipeline_$dtype.sh 2>&1 | tee -a $gemm_log
+done
--- a/example/ck_tile/03_gemm/script/smoke_test_basic.sh
+++ b/example/ck_tile/03_gemm/script/smoke_test_basic.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_basic -type f | head -n 1)"
+[ -n "$EXE" ] || { echo "Error: tile_example_gemm_basic not found under $PWD" >&2; exit 1; }
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=2 -warmup=0 -repeat=1' # expanded unquoted below so it splits into three arguments
+
+# run_tests <prec>: run the binary for each m/n/k combination with the given -prec value
+run_tests() {
+    for m in 128 1024; do
+        for n in 128 2048; do
+            for k in 64 128; do
+
+                if "$EXE" -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -prec="$1" $COMMON_ARGS; then
+                    echo "Success: Test with m=$m, n=$n, k=$k executed successfully."
+                else
+                    echo "Error: Test with m=$m, n=$n, k=$k failed to execute properly."
+                    # Optionally, exit or break if you need to halt further execution
+                    # exit 1
+                fi
+
+            done
+        done
+    done
+}
+
+set -x
+
+run_tests "fp16"
+run_tests "bf16"
+run_tests "fp8"
+run_tests "bf8"
+
+set +x
--- a/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
+++ b/example/ck_tile/03_gemm/script/smoke_test_mem_pipeline.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+EXE="$(find . -name tile_example_gemm_universal -type f | head -n 1)"
+[ -n "$EXE" ] || { echo "Error: tile_example_gemm_universal not found under $PWD" >&2; exit 1; }
+
+export CK_WARMUP=0
+export CK_REPEAT=1
+
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1' # expanded unquoted below so it splits into three arguments
+
+# run_tests <prec>: run the binary for each m/n/k combination with the given -prec value
+run_tests() {
+    for m in 512 1024; do
+        for n in 512 2048; do
+            for k in 512 1024; do
+
+                if "$EXE" -m=$m -n=$n -k=$k -stride_a=0 -stride_b=0 -stride_c=0 -prec="$1" $COMMON_ARGS; then
+                    echo "Success: Test with m=$m, n=$n, k=$k executed successfully."
+                else
+                    echo "Error: Test with m=$m, n=$n, k=$k failed to execute properly."
+                    # Optionally, exit or break if you need to halt further execution
+                    # exit 1
+                fi
+
+            done
+        done
+    done
+}
+
+set -x
+
+run_tests "fp16"
+run_tests "bf16"
+run_tests "fp8"
+run_tests "bf8"
+run_tests "fp16i4"
+run_tests "fp8i4"
+run_tests "bf8i4"
+
+set +x
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -0,0 +1,310 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <hip/hip_runtime.h>
+
+#include <cstring>
+#include <iostream>
+#include <string>
+
+#include "gemm_utils.hpp"
+#include "run_gemm_example.inc"
+#include "run_gemm_example_common.hpp"
+#include "universal_gemm_invoker.hpp"
+
+// Universal GEMM wrapper: when the "test_async" argument is non-zero it runs
+// UniversalInvoker::test_async_input_scheduler and verifies the output against the CPU reference
+// (returning the pass flag); otherwise it forwards to the shared run_gemm_example_with_layouts.
+template <typename GemmConfig,
+          typename ADataType,
+          typename BDataType = ADataType,
+          typename CDataType = ADataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
+int run_gemm_example_with_layouts_universal(ck_tile::ArgParser& arg_parser,
+                                            const ALayout a_layout = ALayout{},
+                                            const BLayout b_layout = BLayout{},
+                                            const CLayout c_layout = CLayout{})
+{
+    using Invoker     = UniversalInvoker;
+    using AccDataType = typename GemmTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
+
+    // Regular runs go straight to the shared implementation.
+    const bool async_mode = arg_parser.get_int("test_async");
+    if(!async_mode)
+    {
+        return run_gemm_example_with_layouts<GemmConfig, Invoker, ADataType, BDataType, CDataType>(
+            arg_parser, a_layout, b_layout, c_layout);
+    }
+
+    // Async test: sizes come from the arguments; strides follow from the layouts (stride_* unread).
+    const ck_tile::index_t M      = arg_parser.get_int("m");
+    const ck_tile::index_t N      = arg_parser.get_int("n");
+    const ck_tile::index_t K      = arg_parser.get_int("k");
+    const ck_tile::index_t kbatch = arg_parser.get_int("split_k");
+
+    using Row                  = ck_tile::tensor_layout::gemm::RowMajor;
+    constexpr bool a_row_major = std::is_same_v<ALayout, Row>;
+    constexpr bool b_row_major = std::is_same_v<BLayout, Row>;
+    constexpr bool c_row_major = std::is_same_v<CLayout, Row>;
+
+    const ck_tile::index_t stride_A = a_row_major ? K : M;
+    const ck_tile::index_t stride_B = b_row_major ? N : K;
+    const ck_tile::index_t stride_C = c_row_major ? N : M;
+
+    // Host-side tensors for both operands and for the result read back from the device.
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, ck_tile::bool_constant<a_row_major>{}));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, ck_tile::bool_constant<b_row_major>{}));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, ck_tile::bool_constant<c_row_major>{}));
+
+    ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5, 5}(a_m_k);
+    ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5, 5}(b_k_n);
+
+    // Device buffers sized from the host tensors; inputs are uploaded, the output is zeroed.
+    ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
+
+    a_m_k_dev_buf.ToDevice(a_m_k.data());
+    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    c_m_n_dev_buf.SetZero();
+    c_m_n_dev_result.SetZero();
+
+    ck_tile::GemmHostArgs args = {a_m_k_dev_buf.GetDeviceBuffer(),
+                                  b_k_n_dev_buf.GetDeviceBuffer(),
+                                  c_m_n_dev_buf.GetDeviceBuffer(),
+                                  kbatch,
+                                  M,
+                                  N,
+                                  K,
+                                  stride_A,
+                                  stride_B,
+                                  stride_C};
+
+    Invoker::template test_async_input_scheduler<GemmConfig,
+                                                 ADataType,
+                                                 BDataType,
+                                                 ck_tile::tuple<>,
+                                                 AccDataType,
+                                                 CDataType,
+                                                 ALayout,
+                                                 BLayout,
+                                                 ck_tile::tuple<>,
+                                                 CLayout,
+                                                 ck_tile::element_wise::PassThrough>(
+        args, ck_tile::stream_config{nullptr, false, 1});
+
+    c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
+
+    // CPU reference computed into a tensor with the same descriptor as the device result.
+    ck_tile::HostTensor<CDataType> c_m_n_ref(
+        ck_tile::host_tensor_descriptor(M, N, stride_C, ck_tile::bool_constant<c_row_major>{}));
+    c_m_n_ref.SetZero();
+    ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(a_m_k, b_k_n, c_m_n_ref);
+
+    // Tolerances are computed from K, the split_k value and the largest reference element.
+    const float max_accumulated_value =
+        *std::max_element(c_m_n_ref.mData.begin(), c_m_n_ref.mData.end());
+    const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+        K, kbatch, max_accumulated_value);
+    const bool pass = do_verify(c_m_n_dev_result, c_m_n_ref, rtol_atol, "CPU");
+
+    std::cout << "Async input scheduler test: " << (pass ? "PASS" : "FAIL") << std::endl;
+    return pass;
+}
+
+// Universal GEMM-specific precision dispatcher: maps the layout strings ("R" -> RowMajor,
+// "C" -> ColumnMajor) to tag types and calls run_gemm_example_with_layouts_universal with a
+// row-major C. Throws std::runtime_error for unknown layouts or unsupported combinations.
+template <typename GemmConfig,
+          typename APrecType,
+          typename BPrecType = APrecType,
+          typename CPrecType = APrecType>
+int run_gemm_example_prec_type_universal(std::string a_layout,
+                                         std::string b_layout,
+                                         ck_tile::ArgParser& arg_parser)
+{
+    using Row             = ck_tile::tensor_layout::gemm::RowMajor;
+    using Col             = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using LayoutVariant   = std::variant<Row, Col>;
+    const bool preshuffle = GemmConfig::Preshuffle;
+
+    if(preshuffle && std::is_same_v<BPrecType, ck_tile::pk_int4_t>)
+    {
+        throw std::runtime_error("Preshuffle is not supported for this int4 datatype!");
+    }
+
+    // Preshuffle needs A == "R" and B == "C"; reject as soon as either one differs.
+    if(preshuffle && (a_layout != "R" || b_layout != "C"))
+    {
+        throw std::runtime_error(
+            "Preshuffle is supported only for A(Row major), B(column major) input matrices!");
+    }
+
+    auto string_to_layout = [](const std::string& layout) -> LayoutVariant {
+        if(layout == "R")
+            return Row{};
+        if(layout == "C")
+            return Col{};
+        throw std::runtime_error("Unsupported layout: " + layout);
+    };
+
+    auto a_layout_variant = string_to_layout(a_layout);
+    auto b_layout_variant = string_to_layout(b_layout);
+
+    return std::visit(
+        [&](auto a_layout_type, auto b_layout_type) -> int {
+            if constexpr(std::is_same_v<BPrecType, ck_tile::pk_int4_t> &&
+                         std::is_same_v<decltype(b_layout_type), Row>)
+            {
+                throw std::runtime_error("Unsupported memory layout for the input matrices when "
+                                         "BPrecType is ck_tile::pk_int4_t!");
+            }
+            else
+            {
+                return run_gemm_example_with_layouts_universal<GemmConfig, APrecType, BPrecType,
+                                                               CPrecType>(
+                    arg_parser, a_layout_type, b_layout_type, Row{});
+            }
+        },
+        a_layout_variant,
+        b_layout_variant);
+}
+
+// Dispatches on the "prec" argument to run_gemm_example_prec_type_universal with the matching
+// A/B/C element types; throws std::runtime_error for unknown precisions or unusable pipelines.
+template <template <typename PrecType> typename GemmConfig>
+int run_gemm_example(ck_tile::ArgParser& arg_parser)
+{
+    const std::string prec     = arg_parser.get_str("prec");
+    const std::string a_layout = arg_parser.get_str("a_layout");
+    const std::string b_layout = arg_parser.get_str("b_layout");
+
+    if(prec == "fp16")
+    {
+        return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::half_t>, ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    if(prec == "bf16")
+    {
+        return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::bf16_t>, ck_tile::bf16_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    if(prec == "fp8")
+    {
+        return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::fp8_t>,
+                                                    ck_tile::fp8_t,
+                                                    ck_tile::fp8_t,
+                                                    ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    if(prec == "bf8")
+    {
+        return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::bf8_t>,
+                                                    ck_tile::bf8_t,
+                                                    ck_tile::bf8_t,
+                                                    ck_tile::half_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    if(prec == "int8")
+    {
+        return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::int8_t>,
+                                                    ck_tile::int8_t,
+                                                    ck_tile::int8_t,
+                                                    ck_tile::int32_t>(
+            a_layout, b_layout, arg_parser);
+    }
+    if(prec == "fp16i4")
+    {
+        // TODO: Add support for bhalf_t ADataType
+        if constexpr(GemmConfig<ck_tile::half_t>::Pipeline == ck_tile::GemmPipeline::COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::half_t>,
+                                                        ck_tile::half_t,
+                                                        ck_tile::pk_int4_t,
+                                                        ck_tile::half_t>(
+                a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    if(prec == "fp8i4")
+    {
+        if constexpr(GemmConfig<ck_tile::fp8_t>::Pipeline == ck_tile::GemmPipeline::COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::fp8_t>,
+                                                        ck_tile::fp8_t,
+                                                        ck_tile::pk_int4_t,
+                                                        ck_tile::half_t>(
+                a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    if(prec == "bf8i4")
+    {
+        if constexpr(GemmConfig<ck_tile::bf8_t>::Pipeline == ck_tile::GemmPipeline::COMPUTE_V3)
+        {
+            return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::bf8_t>,
+                                                        ck_tile::bf8_t,
+                                                        ck_tile::pk_int4_t,
+                                                        ck_tile::half_t>(
+                a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    if(prec == "fp4")
+    {
+        if constexpr(GemmConfig<ck_tile::pk_fp4_t>::Pipeline ==
+                         ck_tile::GemmPipeline::COMPUTE_ASYNC &&
+                     GemmConfig<ck_tile::pk_fp4_t>::K_Warp_Tile == 128)
+        {
+            return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::pk_fp4_t>,
+                                                        ck_tile::pk_fp4_t,
+                                                        ck_tile::pk_fp4_t,
+                                                        ck_tile::half_t>(
+                a_layout, b_layout, arg_parser);
+        }
+        else
+        {
+            throw std::runtime_error("Unsupported pipeline for this operation !!!");
+        }
+    }
+    // Every branch above returns or throws, so only unknown precisions end up here.
+    throw std::runtime_error("Unsupported data type for this operation !!!");
+}
+
+// Parses the command line and runs the universal GEMM example. Exit status: 0 when the run returns
+// non-zero, 1 when it returns 0, -1 on a parse failure, EXIT_FAILURE on a std::runtime_error.
+int main(int argc, char* argv[])
+{
+    auto arg_parser = create_args();
+    if(!arg_parser.parse(argc, argv))
+        return -1;
+
+    try
+    {
+#if CK_TILE_USE_WMMA
+        const int ok = run_gemm_example<GemmConfigComputeV3_WMMA>(arg_parser);
+#else
+        const int ok = run_gemm_example<GemmConfigComputeV3_2>(arg_parser);
+#endif
+        return !ok;
+    }
+    catch(const std::runtime_error& e)
+    {
+        std::cerr << "Caught runtime error: " << e.what() << '\n';
+        return EXIT_FAILURE;
+    }
+}
--- a/example/ck_tile/03_gemm/universal_gemm_invoker.hpp
+++ b/example/ck_tile/03_gemm/universal_gemm_invoker.hpp
@@ -0,0 +1,323 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+#pragma once
+#include <functional>
+#include <chrono>
+#include <thread>
+#include "gemm_utils.hpp"
+#include "ck_tile/host/hip_check_error.hpp"
+#include "ck_tile/host/device_memory.hpp"
+
+struct UniversalInvoker
+{
+    // Instantiates the universal GEMM kernel selected by GemmConfig and the data-type/layout
+    // parameters, launches it through ck_tile::launch_kernel_time_mask and returns that call's
+    // result.
+    //
+    // A preprocess callback is handed to the launcher: it zeroes the output buffer when
+    // args.k_batch > 1 and, when s.flush_cache_ is set, additionally flushes the instruction
+    // cache and advances the rotating A/B buffers.
+    //
+    // Throws std::runtime_error when Kernel::IsSupportedArgument rejects the kernel arguments.
+    // A failing hipMemsetAsync in the preprocess step is reported through HIP_CHECK_ERROR
+    // instead of being silently dropped.
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              bool Persistent,
+              typename CDEElementWise>
+    static float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s)
+    {
+        // Block / warp / warp-tile extents, all taken from GemmConfig.
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             Persistent,
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle>;
+
+        constexpr auto scheduler = GemmConfig::Scheduler;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler>;
+
+        // The concrete pipeline type is looked up from the GemmConfig::Pipeline enumerator.
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             GemmConfig::NumWaveGroups,
+                                             false, /*FixedVectorSize_*/
+                                             1,     /*VectorSizeC_*/
+                                             false, /*TiledMMAPermuteN_*/
+                                             1,     /*BlockedXDLN_PerWarp_*/
+                                             GemmConfig::DoubleSmemBuffer /*DoubleSmemBuffer*/>>;
+
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+
+        auto kargs = Kernel::MakeKernelArgs(args);
+
+        // Persistent kernels use the occupancy-derived grid; otherwise the grid follows the
+        // problem size and split-K factor.
+        const dim3 grids  = Persistent ? Kernel::MaxOccupancyGridSize(s)
+                                       : Kernel::GridSize(args.M, args.N, args.k_batch);
+        const dim3 blocks = Kernel::BlockSize();
+
+        if(!Kernel::IsSupportedArgument(kargs))
+        {
+            throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
+        }
+
+        if(s.log_level_ > 0)
+        {
+            std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                      << "shape: " << GemmShape::GetName() << '\n'
+                      << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                      << "pipeline: " << GemmPipeline::GetName() << '\n'
+                      << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                      << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                      << std::endl;
+        }
+
+        // Declare rotating_mem_ptr here so it stays in scope until it is needed
+        std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
+        std::function<void()> preprocess;
+
+        // Zero the output before a split-K launch (k_batch > 1) - presumably because partial
+        // results are accumulated into E; confirm against the kernel.
+        auto clear_gemm_output = [&]() {
+            if(args.k_batch > 1)
+            {
+                // Widen before multiplying so M * N cannot overflow the narrower index type.
+                const std::size_t output_bytes = static_cast<std::size_t>(args.M) *
+                                                 static_cast<std::size_t>(args.N) *
+                                                 sizeof(CDataType);
+                // Check the HIP status rather than discarding it: a failed clear would
+                // otherwise go unnoticed.
+                HIP_CHECK_ERROR(hipMemsetAsync(args.e_ptr, 0, output_bytes, s.stream_id_));
+            }
+        };
+
+        if(s.flush_cache_)
+        {
+            std::cout << "Flushing cache..." << std::endl;
+
+            // Host tensors are only built to obtain the byte sizes of the A and B buffers.
+            ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
+                args.M, args.K, args.stride_A, is_row_major(ALayout{})));
+            ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
+                args.K, args.N, args.stride_B, is_row_major(BLayout{})));
+
+            auto size_a_buffer = a_m.get_element_space_size_in_bytes();
+            auto size_b_buffer = b_n.get_element_space_size_in_bytes();
+
+            rotating_mem_ptr = std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
+                kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer);
+            rotating_mem_ptr->Print();
+
+            // Captures by reference: rotating_mem_ptr and clear_gemm_output outlive the launch
+            // below, which is the only user of this callback.
+            preprocess = [&]() {
+                ck_tile::flush_icache();
+                rotating_mem_ptr->Next();
+                clear_gemm_output();
+            };
+        }
+        else
+        {
+            preprocess = clear_gemm_output;
+        }
+
+        return ck_tile::launch_kernel_time_mask(
+            s,
+            preprocess,
+            ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    }
+
+    // Exercises the persistent GEMM kernel together with a PersistentAsyncInputScheduler.
+    //
+    // The kernel is launched on the stream of `s` while this thread releases the chunk
+    // signals one by one (with a short sleep in between) on a separate non-blocking stream.
+    // The chunk layout and the wall-clock time of the whole sequence are printed to std::cout.
+    // HIP failures are reported through HIP_CHECK_ERROR.
+    template <typename GemmConfig,
+              typename ADataType,
+              typename BDataType,
+              typename DsDataType,
+              typename AccDataType,
+              typename CDataType,
+              typename ALayout,
+              typename BLayout,
+              typename DsLayout,
+              typename ELayout,
+              typename CDEElementWise>
+    static void test_async_input_scheduler(const ck_tile::GemmHostArgs& args,
+                                           const ck_tile::stream_config& s)
+    {
+        using GemmShape = ck_tile::TileGemmShape<
+            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
+            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
+            ck_tile::
+                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
+            GemmConfig::PermuteA,
+            GemmConfig::PermuteB>;
+
+        using TilePartitioner =
+            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
+                                                       GemmConfig::TileParitionerGroupNum,
+                                                       GemmConfig::TileParitionerM01>;
+
+        using GemmUniversalTraits =
+            ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
+                                             GemmConfig::kPadN,
+                                             GemmConfig::kPadK,
+                                             GemmConfig::DoubleSmemBuffer,
+                                             ALayout,
+                                             BLayout,
+                                             ELayout,
+                                             GemmConfig::TransposeC,
+                                             GemmConfig::UseStructuredSparsity,
+                                             true, // Persistent = true for async test
+                                             GemmConfig::NumWaveGroups,
+                                             GemmConfig::Preshuffle>;
+
+        constexpr auto scheduler = GemmConfig::Scheduler;
+
+        using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
+                                                                           BDataType,
+                                                                           AccDataType,
+                                                                           GemmShape,
+                                                                           GemmUniversalTraits,
+                                                                           scheduler>;
+
+        using GemmPipeline = typename PipelineTypeTraits<
+            GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+
+        using GemmEpilogue = ck_tile::CShuffleEpilogue<
+            ck_tile::CShuffleEpilogueProblem<ADataType,
+                                             BDataType,
+                                             DsDataType,
+                                             AccDataType,
+                                             CDataType,
+                                             DsLayout,
+                                             ELayout,
+                                             CDEElementWise,
+                                             TilePartitioner::MPerBlock,
+                                             TilePartitioner::NPerBlock,
+                                             GemmConfig::M_Warp,
+                                             GemmConfig::N_Warp,
+                                             GemmConfig::M_Warp_Tile,
+                                             GemmConfig::N_Warp_Tile,
+                                             GemmConfig::K_Warp_Tile,
+                                             UniversalGemmProblem::TransposeC,
+                                             GemmConfig::NumWaveGroups,
+                                             false, /*FixedVectorSize_*/
+                                             1,     /*VectorSizeC_*/
+                                             false, /*TiledMMAPermuteN_*/
+                                             1,     /*BlockedXDLN_PerWarp_*/
+                                             GemmConfig::DoubleSmemBuffer>>;
+
+        using Kernel = ck_tile::GemmKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+
+        const ck_tile::index_t tiles_m =
+            ck_tile::integer_divide_ceil(args.M, TilePartitioner::MPerBlock);
+        // Balance signal granularity (smaller chunks = finer control) vs overhead (more signals)
+        const ck_tile::index_t tiles_per_chunk = 2;
+        // Shift chunk assignments to test wraparound behavior
+        const ck_tile::index_t tile_idx_pivot = tiles_per_chunk;
+        // Account for pivot when allocating signal buffer
+        const ck_tile::index_t num_chunks =
+            ck_tile::integer_divide_ceil(tiles_m + tile_idx_pivot, tiles_per_chunk);
+
+        std::cout << "Async Input Scheduler Test:" << std::endl;
+        std::cout << "  M tiles: " << tiles_m << std::endl;
+        std::cout << "  Tiles per chunk: " << tiles_per_chunk << std::endl;
+        std::cout << "  Tile index pivot: " << tile_idx_pivot << std::endl;
+        std::cout << "  Number of signal chunks: " << num_chunks << std::endl;
+
+        // Signals must start as zero so kernel blocks until producer sets them
+        ck_tile::DeviceMem signal_buf(num_chunks * sizeof(uint32_t));
+        signal_buf.SetZero();
+        uint32_t* d_chunk_signals = static_cast<uint32_t*>(signal_buf.GetDeviceBuffer());
+
+        // Setup async input scheduler
+        ck_tile::PersistentAsyncInputScheduler async_scheduler;
+        async_scheduler.tiles_per_chunk_m = tiles_per_chunk;
+        async_scheduler.chunk_signals     = d_chunk_signals;
+        async_scheduler.tile_idx_pivot_m  = tile_idx_pivot;
+        async_scheduler.num_chunks        = num_chunks;
+
+        // Create modified host args with async scheduler
+        ck_tile::UniversalGemmHostArgs<1, 1, 0> host_args({args.a_ptr},
+                                                          {args.b_ptr},
+                                                          {},
+                                                          args.e_ptr,
+                                                          args.k_batch,
+                                                          args.M,
+                                                          args.N,
+                                                          args.K,
+                                                          {args.stride_A},
+                                                          {args.stride_B},
+                                                          {},
+                                                          args.stride_E,
+                                                          async_scheduler);
+
+        auto kargs = Kernel::UniversalGemmKernel::MakeKernelArgs(host_args);
+
+        const dim3 grids  = Kernel::MaxOccupancyGridSize(s);
+        const dim3 blocks = Kernel::BlockSize();
+
+        std::cout << "  Grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                  << std::endl;
+        std::cout << "  Blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
+                  << std::endl;
+
+        // Separate stream prevents deadlock: kernel and signal producer must run concurrently
+        hipStream_t signal_stream;
+        HIP_CHECK_ERROR(hipStreamCreateWithFlags(&signal_stream, hipStreamNonBlocking));
+
+        const auto start = std::chrono::high_resolution_clock::now();
+
+        ck_tile::launch_kernel(
+            s, ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        // Simulate incremental input arrival by delaying signal activation
+        const int sleep_us = 100;
+        // Host-side source of every asynchronous copy below. It lives outside the loop so the
+        // memory stays valid until signal_stream has been synchronized.
+        const uint32_t signal_val = 1;
+        for(ck_tile::index_t i = 0; i < num_chunks; ++i)
+        {
+            std::this_thread::sleep_for(std::chrono::microseconds(sleep_us));
+            HIP_CHECK_ERROR(hipMemcpyAsync(d_chunk_signals + i,
+                                           &signal_val,
+                                           sizeof(uint32_t),
+                                           hipMemcpyHostToDevice,
+                                           signal_stream));
+        }
+        HIP_CHECK_ERROR(hipStreamSynchronize(signal_stream));
+        HIP_CHECK_ERROR(hipStreamDestroy(signal_stream));
+
+        // Wait for kernel completion
+        HIP_CHECK_ERROR(hipDeviceSynchronize());
+
+        // Wall-clock time from just before the launch until the device is idle again; it
+        // therefore includes the sleeps reported as "Sleep time" below.
+        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::high_resolution_clock::now() - start);
+
+        std::cout << "  Total time: " << duration.count() << " us" << std::endl;
+        std::cout << "  Sleep time: " << (num_chunks * sleep_us) << " us" << std::endl;
+    }
+};
--- a/example/ck_tile/04_img2col/CMakeLists.txt
+++ b/example/ck_tile/04_img2col/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+# NOTE(review): add_executable() without EXCLUDE_FROM_ALL still places the target in the
+# default "all" build unless an enclosing add_subdirectory() excludes this directory -
+# confirm which behavior is intended.
+add_executable(tile_example_img2col image_to_column.cpp)
--- a/example/ck_tile/04_img2col/README.md
+++ b/example/ck_tile/04_img2col/README.md
@@ -0,0 +1,51 @@
+# Image to Column (im2col) with CK Tile
+
+This example demonstrates the im2col transformation using the CK Tile programming model, a key step for converting convolution into GEMM for efficient GPU execution.
+
+---
+
+## Algorithm and Math
+
+Given an input image tensor $X$ and convolution kernel size, im2col rearranges sliding windows of $X$ into columns:
+- For each patch, flatten and stack as a column in the output matrix.
+- Enables convolution as matrix multiplication: $\text{im2col}(X) \times W$.
+
+---
+
+## Tile Programming Model
+
+- **Tiles**: Each thread block processes a tile (block of patches).
+- **Pipeline**: Modular, can be extended for fused operations (e.g., quantization, activation).
+
+---
+
+## Build & Run
+
+```bash
+mkdir build && cd build
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank
+../script/cmake-ck-dev.sh  ../ <arch>
+make tile_example_img2col -j
+./bin/tile_example_img2col -?
+```
+
+---
+
+## Source Structure
+
+- **Kernel**: `image_to_column.hpp` (tile-programming kernel template)
+- **Executable**: `image_to_column.cpp` (argument parsing, kernel launch)
+- **Build**: `CMakeLists.txt`
+
+---
+
+## Related CK Tile Examples
+
+- [03_gemm](../03_gemm/README.md): GEMM with tiles (im2col output as input)
+- [05_reduce](../05_reduce/README.md): Reductions with tiles
+- [06_permute](../06_permute/README.md): Permutation with tiles
+
+For the tile distribution implementation, see `include/ck_tile/tile_program/tile_distribution/`.
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/04_img2col/image_to_column.cpp
+++ b/example/ck_tile/04_img2col/image_to_column.cpp
@@ -0,0 +1,177 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <algorithm>
+#include <cstring>
+
+#include "ck_tile/host.hpp"
+#include "image_to_column.hpp"
+
+// Host API implementation for two spatial dimensions.
+// A kernel is launched only when traits.data_type equals "fp16"; for every other value the
+// function does no work and returns 0. Otherwise the value returned by
+// ck_tile::launch_kernel is passed back to the caller.
+template <>
+float image_to_column(const image_to_column_traits& traits,
+                      const image_to_column_args<2>& args,
+                      const ck_tile::stream_config& stream_conf)
+{
+    // Guard clause: nothing to launch for any other data type.
+    if(traits.data_type != "fp16")
+    {
+        return 0;
+    }
+
+    constexpr ck_tile::index_t NDimSpatial = 2;
+    constexpr ck_tile::index_t VectorSize  = 8;
+    constexpr ck_tile::index_t kBlockPerCu = 2;
+
+    using InDataType  = ck_tile::half_t;
+    using OutDataType = ck_tile::half_t;
+
+    // Tile hierarchy: per-thread, per-warp and per-block extents.
+    using thread_tile = ck_tile::sequence<8, 8>;
+    using warp_tile   = ck_tile::sequence<64, 64>;
+    using block_tile  = ck_tile::sequence<128, 128>;
+    using Shape       = ck_tile::TileImageToColumnShape<thread_tile, warp_tile, block_tile>;
+
+    using PipelineProblem = ck_tile::BlockImageToColumnProblem<InDataType,
+                                                               OutDataType,
+                                                               Shape,
+                                                               NDimSpatial,
+                                                               VectorSize,
+                                                               VectorSize>;
+    using Kernel          = ck_tile::ImageToColumn<PipelineProblem>;
+
+    auto kargs = Kernel::MakeKargs(args.p_in,
+                                   args.p_out,
+                                   args.G,
+                                   args.N,
+                                   args.C,
+                                   args.input_spatial_lengths,
+                                   args.filter_spatial_lengths,
+                                   args.output_spatial_lengths,
+                                   args.image_g_n_c_wis_strides,
+                                   args.gemm_g_m_k_strides,
+                                   args.conv_filter_strides,
+                                   args.conv_filter_dilations,
+                                   args.input_left_pads,
+                                   args.input_right_pads);
+
+    // Extents handed to GridSize: N * Ho * Wo, Y * X * C, and the group count.
+    const auto gemm_m = args.N * args.output_spatial_lengths[0] * args.output_spatial_lengths[1];
+    const auto gemm_k = args.filter_spatial_lengths[0] * args.filter_spatial_lengths[1] * args.C;
+
+    const dim3 grids  = Kernel::GridSize(gemm_m, gemm_k, args.G);
+    const dim3 blocks = Kernel::BlockSize();
+
+    float ave_time = ck_tile::launch_kernel(
+        stream_conf, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+    return ave_time;
+}
+
+// Entry point of the image-to-column example.
+// Parses the command line, fills an input tensor, runs image_to_column on the device and,
+// if requested, compares the result with ck_tile::reference_im2col.
+// Returns EXIT_FAILURE for bad arguments or an unsupported number of spatial dimensions,
+// 1 when verification fails and 0 otherwise.
+int main(int argc, char* argv[])
+{
+    constexpr ck_tile::index_t NDimSpatial = 2;
+
+    ExecutionConfig config;
+    ck_tile::conv::ConvParam conv_params = DefaultConvParams;
+
+    if(!parse_cmd_args(argc, argv, config, conv_params))
+    {
+        return EXIT_FAILURE;
+    }
+
+    if(conv_params.num_dim_spatial_ != NDimSpatial)
+    {
+        std::cerr << "unsupported # of spatial dimensions" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    using InDataType  = ck_tile::half_t;
+    using OutDataType = ck_tile::half_t;
+    using ImLayout    = ck_tile::tensor_layout::convolution::NHWGC;
+
+    const auto G = conv_params.G_;
+    const auto N = conv_params.N_;
+    const auto C = conv_params.C_;
+
+    // std::accumulate accumulates in the type of its initial value, so the seed must be a
+    // long_index_t: an int seed would truncate the running product to int.
+    const ck_tile::long_index_t NHoWo =
+        N * std::accumulate(conv_params.output_spatial_lengths_.begin(),
+                            std::next(conv_params.output_spatial_lengths_.begin(), NDimSpatial),
+                            ck_tile::long_index_t{1},
+                            std::multiplies<>());
+
+    const ck_tile::long_index_t CYX =
+        C * std::accumulate(conv_params.filter_spatial_lengths_.begin(),
+                            std::next(conv_params.filter_spatial_lengths_.begin(), NDimSpatial),
+                            ck_tile::long_index_t{1},
+                            std::multiplies<>());
+
+    const auto in_desc =
+        ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(conv_params);
+    // Output is described as a {G, N*Ho*Wo, C*Y*X} tensor.
+    const auto out_desc = ck_tile::HostTensorDescriptor({G, NHoWo, CYX});
+
+    // host verify
+    ck_tile::HostTensor<InDataType> in(in_desc);
+    ck_tile::HostTensor<OutDataType> out_device(out_desc);
+    ck_tile::HostTensor<OutDataType> out_host(out_desc);
+
+    // init_method: 0 leaves the input as constructed, 1 fills it with integer values,
+    // anything else with decimal values (see print_help_msg).
+    switch(config.init_method)
+    {
+    case 0: break;
+    case 1: ck_tile::FillUniformDistributionIntegerValue<InDataType>{-5.f, 5.f}(in); break;
+    default: ck_tile::FillUniformDistribution<InDataType>{-0.5, 0.5}(in); break;
+    }
+
+    ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes());
+
+    in_device_buf.ToDevice(in.data());
+
+    image_to_column_traits traits{"fp16"};
+
+    image_to_column_args<NDimSpatial> args{
+        in_device_buf.GetDeviceBuffer(),
+        out_device_buf.GetDeviceBuffer(),
+        G,
+        N,
+        C,
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.filter_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.output_spatial_lengths_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial + 3>(in_desc.get_strides()),
+        ck_tile::to_array<ck_tile::long_index_t, 3>(out_desc.get_strides()),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_strides_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_dilations_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_left_pads_),
+        ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_right_pads_)};
+
+    float ave_time =
+        image_to_column(traits, args, ck_tile::stream_config{nullptr, config.time_kernel});
+
+    if(config.time_kernel)
+    {
+        // Bytes moved: every output element is written once and one input element is counted
+        // as read for it.
+        std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType));
+        float gb_per_sec      = num_btype / 1.E6 / ave_time;
+        std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+    }
+    else
+    {
+        std::cout << "image_to_column: pass, No Perf generated due to config.time_kernel=0"
+                  << std::endl;
+    }
+
+    bool pass = true;
+
+    if(config.do_verification)
+    {
+        // reference
+        ck_tile::reference_im2col<InDataType, OutDataType, NDimSpatial>(in, out_host, conv_params);
+
+        out_device_buf.FromDevice(out_device.data());
+        pass = ck_tile::check_err(out_device, out_host);
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+    }
+
+    // Exit code 0 on success, 1 when verification failed.
+    return !pass;
+}
--- a/example/ck_tile/04_img2col/image_to_column.hpp
+++ b/example/ck_tile/04_img2col/image_to_column.hpp
@@ -0,0 +1,105 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/image_to_column.hpp"
+#include <exception>
+#include <string>
+
+// Default convolution problem; expands to a ck_tile::conv::ConvParam temporary.
+// NOTE(review): the meaning and order of the values are defined by ConvParam's constructor,
+// which is not visible here - presumably num_dim_spatial, G, N, K, C followed by the
+// per-dimension filter lengths, input lengths, strides, dilations, left pads and right pads;
+// TODO confirm.
+#define DefaultConvParams                                                    \
+    ck_tile::conv::ConvParam                                                 \
+    {                                                                        \
+        2, 2, 32, 32, 32, {4, 4}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, { 0, 0 } \
+    }
+
+// Run-time switches of the example; the defaults apply when no arguments are given.
+struct ExecutionConfig final
+{
+    // arg1: compare the device result with the host reference
+    bool do_verification{true};
+    // arg2: 0 = no init, 1 = integer values, 2 = decimal values
+    int init_method{1};
+    // arg3: time the kernel and print performance numbers
+    bool time_kernel{false};
+};
+
+// Writes the usage text to std::cerr: the three execution-config arguments followed by the
+// convolution-parameter help provided by ck_tile.
+inline void print_help_msg()
+{
+    constexpr const char* execution_config_help =
+        "arg1: verification (0=no, 1=yes)\n"
+        "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
+        "arg3: time kernel (0=no, 1=yes)\n";
+
+    std::cerr << execution_config_help << ck_tile::conv::get_conv_param_parser_helper_msg()
+              << std::endl;
+}
+
+// Parses the command line into `config` and `conv_params`.
+//
+// Accepted forms:
+//   * no arguments            -> default ExecutionConfig, conv_params left untouched
+//   * exactly three arguments -> verification, initialization and time-kernel switches
+//   * three switches followed by a convolution description
+//                             -> switches plus conv_params from ck_tile::conv::parse_conv_param
+//
+// Returns true on success. For any other argument count, or when an argument cannot be
+// converted (std::stoi throws), the help text is printed and false is returned.
+inline bool parse_cmd_args(int argc,
+                           char* argv[],
+                           ExecutionConfig& config,
+                           ck_tile::conv::ConvParam& conv_params)
+{
+    constexpr int num_execution_config_args =
+        3; // arguments for do_verification, init_method, time_kernel
+    constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_
+
+    constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args;
+    constexpr int threshold_to_catch_all_args =
+        threshold_to_catch_partial_args + num_conv_param_leading_args;
+
+    // Shared by both argument forms that carry the execution switches.
+    const auto parse_execution_config = [&]() {
+        config.do_verification = std::stoi(argv[1]);
+        config.init_method     = std::stoi(argv[2]);
+        config.time_kernel     = std::stoi(argv[3]);
+    };
+
+    try
+    {
+        if(argc == 1)
+        {
+            // use default
+            config = ExecutionConfig{};
+        }
+        // catch only ExecutionConfig arguments
+        else if(argc == threshold_to_catch_partial_args)
+        {
+            parse_execution_config();
+        }
+        // catch both ExecutionConfig & ConvParam arguments
+        // NOTE(review): the "% 3" test accepts any multiple of three trailing arguments;
+        // whether parse_conv_param consumes exactly that many is not visible here - confirm.
+        else if(threshold_to_catch_all_args < argc &&
+                ((argc - threshold_to_catch_all_args) % 3 == 0))
+        {
+            parse_execution_config();
+
+            const ck_tile::index_t num_dim_spatial = std::stoi(argv[4]);
+            // NOTE(review): the start index passed below is 4, i.e. the position
+            // num_dim_spatial was just read from - confirm that parse_conv_param expects that.
+            conv_params = ck_tile::conv::parse_conv_param(
+                num_dim_spatial, threshold_to_catch_partial_args, argv);
+        }
+        else
+        {
+            print_help_msg();
+            return false;
+        }
+    }
+    catch(const std::exception& e)
+    {
+        // std::stoi throws on non-numeric or out-of-range input; report it like any other
+        // malformed command line instead of terminating with an uncaught exception.
+        std::cerr << "invalid argument: " << e.what() << '\n';
+        print_help_msg();
+        return false;
+    }
+
+    return true;
+}
+
+// Run-time selector for the host API.
+struct image_to_column_traits
+{
+    // Name of the element type, e.g. "fp16" (the only value the example passes).
+    std::string data_type;
+};
+
+// Arguments of the image_to_column host API for NDimSpatial spatial dimensions.
+// The example fills p_in / p_out with device buffer pointers and the remaining fields from
+// the convolution parameters and the host tensor descriptors.
+template <ck_tile::index_t NDimSpatial>
+struct image_to_column_args
+{
+    // Input image buffer and output buffer.
+    const void* p_in;
+    void* p_out;
+    // Group count, batch size and input channel count.
+    const ck_tile::long_index_t G;
+    const ck_tile::long_index_t N;
+    const ck_tile::long_index_t C;
+    // Per-dimension lengths of input, filter and output.
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_spatial_lengths;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> filter_spatial_lengths;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> output_spatial_lengths;
+    // Strides of the input tensor (G, N, C plus the spatial dimensions) and of the 3-d
+    // output tensor (G, M, K).
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial + 3> image_g_n_c_wis_strides;
+    const ck_tile::array<ck_tile::long_index_t, 3> gemm_g_m_k_strides;
+    // Convolution strides, dilations and paddings per spatial dimension.
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> conv_filter_strides;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> conv_filter_dilations;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_left_pads;
+    const ck_tile::array<ck_tile::long_index_t, NDimSpatial> input_right_pads;
+};
+
+// host API
+// Declaration only: the definitions are explicit specializations provided by a source file
+// (image_to_column.cpp specializes NDimSpatial == 2).
+template <ck_tile::index_t NDimSpatial>
+float image_to_column(const image_to_column_traits&,
+                      const image_to_column_args<NDimSpatial>&,
+                      const ck_tile::stream_config&);
--- a/example/ck_tile/05_reduce/CMakeLists.txt
+++ b/example/ck_tile/05_reduce/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Target names of the three reduce examples.
+set(EXAMPLE_REDUCE "tile_example_reduce")
+# Multi Reduce Threadwise Example
+set(EXAMPLE_MULTI_REDUCE "tile_example_multi_reduce_threadwise")
+# Multi Reduce Blockwise Example
+set(EXAMPLE_MULTI_REDUCE_BLOCKWISE "tile_example_multi_reduce_multiblock")
+
+# not using add_example_executable() to add these targets, since we don't want them to have
+# to be included in "make all/install/check"
+message(DEBUG "adding example ${EXAMPLE_REDUCE}")
+add_executable(${EXAMPLE_REDUCE} reduce.cpp)
+add_executable(${EXAMPLE_MULTI_REDUCE} EXCLUDE_FROM_ALL multiple_reduce_threadwise.cpp)
+add_executable(${EXAMPLE_MULTI_REDUCE_BLOCKWISE} EXCLUDE_FROM_ALL multiple_reduce_multiblock.cpp)
+
+# All three targets share the same include directory and warning suppressions.
+# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+foreach(example_var IN ITEMS EXAMPLE_REDUCE EXAMPLE_MULTI_REDUCE EXAMPLE_MULTI_REDUCE_BLOCKWISE)
+  set(${example_var}_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+  target_include_directories(${${example_var}} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+  target_compile_options(${${example_var}} PRIVATE ${${example_var}_COMPILE_OPTIONS})
+endforeach()
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/05_reduce/README.md
+++ b/example/ck_tile/05_reduce/README.md
@@ -0,0 +1,53 @@
+# Reduction with CK Tile
+
+This example demonstrates parallel reduction (sum, max, etc.) using the CK Tile programming model, a core operation for normalization, statistics, and aggregation in deep learning.
+
+---
+
+## Algorithm and Math
+
+Given a tensor $X$ and a reduction axis, compute:
+- **Sum**: $Y = \sum_i X_i$
+- **Max**: $Y = \max_i X_i$
+- **Mean**: $Y = \frac{1}{N} \sum_i X_i$
+
+- **Tilewise Reduction**: Each thread block reduces a tile (block) of the input, using shared memory and register accumulation for efficiency.
+
+---
+
+## Tile Programming Model
+
+- **Tiles**: Each thread block processes a tile (block) of the input tensor.
+- **Pipeline**: Modular, can be extended for fused reductions or post-processing.
+
+---
+
+## Build & Run
+
+```bash
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh ../ <arch>
+make tile_example_reduce -j
+./bin/tile_example_reduce -?
+```
+
+---
+
+## Source Structure
+
+- **Kernel**: `reduce.hpp` (tile-programming kernel template)
+- **Executable**: `reduce.cpp` (argument parsing, kernel launch)
+- **Build**: `CMakeLists.txt`
+
+---
+
+## Related CK Tile Examples
+
+- [03_gemm](../03_gemm/README.md): GEMM with tiles
+- [04_img2col](../04_img2col/README.md): im2col transformation
+- [06_permute](../06_permute/README.md): Permutation with tiles
+
+For distribution, see `include/ck_tile/tile_program/tile_distribution/`.
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/05_reduce/multiple_reduce_multiblock.cpp
+++ b/example/ck_tile/05_reduce/multiple_reduce_multiblock.cpp
@@ -0,0 +1,271 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include <cstring>
+
+// Compile-time lookup from an element type to its precision name string.
+// The primary template is only declared; a type without a specialization below
+// cannot be used with it.
+template <typename T>
+struct DataTypeTraits;
+
+// half precision -> "fp16"
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr auto name = "fp16";
+};
+
+// bfloat16 -> "bf16"
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr auto name = "bf16";
+};
+
+// Registers the command-line options of this example (each with its default value
+// and help text) and parses argv.
+// Returns a tuple {parse succeeded, populated parser}.
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser parser;
+
+    // problem size
+    parser.insert("n", "32", "n dimension");
+    parser.insert("h", "19", "h dimension");
+    parser.insert("w", "7", "w dimension");
+    parser.insert("c", "512", "c dimension");
+
+    // validation / precision / timing
+    parser.insert("v", "1", "cpu validation or not");
+    parser.insert("prec", "fp16", "precision");
+    parser.insert("warmup", "5", "cold iter");
+    parser.insert("repeat", "20", "hot iter");
+
+    // result dumping
+    parser.insert("json", "0", "0: No Json, 1: Dump Results in Json format");
+    parser.insert("jsonfile", "multi_reduce_multiblock.json", "json file name to dump results");
+
+    const bool parsed_ok = parser.parse(argc, argv);
+    return std::make_tuple(parsed_ok, parser);
+}
+
+// Runs the multi-block multiple-reduction example for input element type `DataType`.
+//
+// Reads the N/H/W/C extents plus validation and timing settings from `arg_parser`,
+// reduces a 4D {N, H, W, C} input over H and W (keeping N and C) with two fused
+// operations -- a plain sum and a mean of squares -- times the kernel, and, when "-v"
+// is non-zero, compares every output against the host reference.
+//
+// Returns true when validation passes (or is disabled); returns false when validation
+// fails or when the kept or the reduced extent is zero.
+// Throws std::runtime_error when Kernel::IsSupportedArgument rejects the arguments.
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = float;
+
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    // Validate input dimensions
+    const ck_tile::index_t kept_dim_len_prod   = N * C;
+    const ck_tile::index_t reduce_total_length = H * W;
+
+    if(kept_dim_len_prod == 0)
+    {
+        std::cerr << "Warning: Product of kept dimensions is zero (N=" << N << ", C=" << C
+                  << ", product=" << kept_dim_len_prod << ")." << std::endl;
+        std::cerr << "This will result in an empty output tensor." << std::endl;
+        return false;
+    }
+
+    if(reduce_total_length == 0)
+    {
+        std::cerr << "Warning: Product of reduce dimensions is zero (H=" << H << ", W=" << W
+                  << ", product=" << reduce_total_length << ")." << std::endl;
+        std::cerr << "This will result in an empty reduction with no data to process." << std::endl;
+        std::cerr << "The kernel will exit early without performing any computation." << std::endl;
+        return false;
+    }
+
+    // Packed row-major layout over {N, H, W, C}: C is the fastest-varying dimension.
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimensions to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
+
+    // One {N, C} output per operation: operation 0 is the sum, operation 1 the mean square.
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_add_ref({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_mean_sq_ref({N, C}, {C, 1});
+    auto y_host_ref_tuple = ck_tile::make_tuple(y_host_add_ref, y_host_mean_sq_ref);
+
+    ck_tile::HostTensor<YDataType> y_host_add_dev({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_mean_sq_dev({N, C}, {C, 1});
+    auto y_host_dev_tuple = ck_tile::make_tuple(y_host_add_dev, y_host_mean_sq_dev);
+
+    const auto number_operations = y_host_dev_tuple.size();
+
+    // Host staging buffer holding the outputs of all operations back to back.
+    std::vector<YDataType> h(number_operations * N * C);
+
+    auto y_buf_size = number_operations *
+                      y_host_dev_tuple.at(ck_tile::number<0>{}).get_element_space_size_in_bytes();
+    ck_tile::DeviceMem y_buf(y_buf_size);
+
+    // Distance, in elements, between the outputs of two consecutive operations.
+    const auto output_tensor_offset = N * C;
+
+    // Operations: one doing a sum reduction, the other computing the mean square
+    // In the case of mean square:
+    // 1. The element wise operation squares each element before reduction
+    // 2. The reduction operation sums the squared elements
+    // 3. The accumulator element wise operation divides the result by the total number of reduced
+    // elements (intra block operation)
+    // 4. The partial result is updated across blocks using inter block reduction, a sum.
+    auto reduce_ops =
+        ck_tile::make_tuple(ck_tile::ReduceOp::Add{}, ck_tile::ReduceOp::Add{}); // reductions
+    auto elementwise_ops = ck_tile::make_tuple(ck_tile::element_wise::PassThrough{},
+                                               ck_tile::element_wise::UnarySquare{}); // Elementwise
+                                                                                      // ops
+    auto accumulator_elementwise_ops = ck_tile::make_tuple(
+        ck_tile::element_wise::PassThrough{},
+        ck_tile::element_wise::UnaryDivide{
+            reduce_total_length}); // Accumulator Elementwise ops on reduction, intra block
+    auto inter_block_reduce_ops = ck_tile::make_tuple(
+        ck_tile::ReduceOp::Add{}, ck_tile::ReduceOp::Add{}); // Inter block reduction
+
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+
+    using BlockWarps = ck_tile::sequence<4, 1>;
+    using BlockTile  = ck_tile::sequence<128, 128>;
+    using WarpTile   = ck_tile::sequence<32, 128>;
+    using ThreadTile = ck_tile::sequence<8, 8>;
+
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    using Shape   = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+    using Problem = ck_tile::Reduce2dProblem<XDataType,
+                                             ComputeDataType,
+                                             YDataType,
+                                             Shape,
+                                             decltype(reduce_ops),
+                                             decltype(kept_dim),
+                                             decltype(reduce_dims),
+                                             4>;
+
+    using Kernel = ck_tile::MultiReduceMultiblock<Problem>;
+
+    // Determine block group size for multi-block reduction.
+    // block_group_size records how many blocks participate in one reduction (input data
+    // dependent); for efficiency reasons this size is limited to a maximum of 128. If this is
+    // not sufficient to process the whole reduction, each thread processes multiple thread
+    // tiles, num_block_tile_iterations times.
+    auto [num_block_tile_iterations, block_group_size] =
+        typename Kernel::TilePartitioner{reduce_total_length}.GetBlockGroupParams();
+
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
+    // ceil(kept elements / Block_M) block groups, each made of block_group_size blocks
+    ck_tile::index_t kGridSize =
+        ((kept_dim_len_prod + Shape::Block_M - 1) / Shape::Block_M) * block_group_size;
+
+    std::cout << "Block group size: " << block_group_size
+              << ", Num block tile iterations: " << num_block_tile_iterations
+              << ", Reduce total length: " << reduce_total_length << std::endl;
+    std::cout << "grid size " << kGridSize << ", block size " << kBlockSize << std::endl;
+
+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    if(!Kernel::IsSupportedArgument(
+           C, input_strides)) // output tensor's contiguous dimension and input strides
+    {
+        throw std::runtime_error("Wrong! Arguments not supported!\n");
+    }
+
+    // Init the output data with identity values respective to each reduce op
+    ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
+        constexpr auto op       = reduce_ops.at(i);
+        const auto identity_val = op.template GetIdentityValue<YDataType>();
+        std::fill(h.begin() + i * output_tensor_offset,
+                  h.begin() + (i + 1) * output_tensor_offset,
+                  identity_val);
+    });
+
+    // Passed to launch_kernel_time_mask so the device output can be reset to the identity
+    // values; blocks accumulate into it through the inter-block reduction.
+    auto clear_output_buffer = [&]() { y_buf.ToDevice(h.data()); };
+
+    float ave_time = launch_kernel_time_mask(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        clear_output_buffer,
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                          input_shape,
+                                          input_strides,
+                                          kept_dim,
+                                          reduce_dims,
+                                          output_tensor_offset,
+                                          elementwise_ops,
+                                          accumulator_elementwise_ops,
+                                          inter_block_reduce_ops)
+
+    );
+
+    // Bytes moved: the whole input is read once, one {N, C} output is counted as written.
+    std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
+
+    // bytes / 1e6 / milliseconds == GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        ck_tile::reference_multiple_reduce_multiblock<XDataType, ComputeDataType, YDataType>(
+            x_host,
+            y_host_ref_tuple,
+            reduce_ops,
+            kept_dim,
+            reduce_dims,
+            elementwise_ops,
+            accumulator_elementwise_ops,
+            inter_block_reduce_ops,
+            block_group_size);
+        // The whole device buffer is copied back below, so report its full size.
+        std::cout << "Read " << y_buf_size << " Bytes from the device" << std::endl;
+
+        // Transfer data from device and check error for each operation
+        y_buf.FromDevice(h.data());
+        ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
+            // Unpack the i-th operation's slice of the staging buffer into its host tensor.
+            std::memcpy(y_host_dev_tuple.get(ck_tile::number<i>{}).data(),
+                        h.data() + i * output_tensor_offset,
+                        output_tensor_offset * sizeof(YDataType));
+            std::cout << "Checking operation " << i << ": " << std::endl;
+
+            bool pass_op = ck_tile::check_err(y_host_dev_tuple.get(ck_tile::number<i>{}),
+                                              y_host_ref_tuple.get(ck_tile::number<i>{}));
+
+            if(pass_op)
+            {
+                std::cout << "✅ valid results for this operation" << std::endl;
+            }
+            pass &= pass_op;
+        });
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+// Entry point: parses the command line and dispatches on "-prec".
+// Exit codes: 0 success, -1 argument parsing failed, -2 the run failed (or its input
+// was rejected), -3 unsupported precision.
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    // Only fp16 is dispatched here. Reject everything else explicitly: falling off the end
+    // of main() would exit with status 0 although nothing was run.
+    std::cerr << "Unsupported precision: " << data_type << " (supported: fp16)" << std::endl;
+    return -3;
+}
--- a/example/ck_tile/05_reduce/multiple_reduce_threadwise.cpp
+++ b/example/ck_tile/05_reduce/multiple_reduce_threadwise.cpp
@@ -0,0 +1,224 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include <cstring>
+
+// Compile-time lookup from an element type to its precision name string.
+// The primary template is only declared; a type without a specialization below
+// cannot be used with it.
+template <typename T>
+struct DataTypeTraits;
+
+// half precision -> "fp16"
+template <>
+struct DataTypeTraits<ck_tile::half_t>
+{
+    static constexpr auto name = "fp16";
+};
+
+// bfloat16 -> "bf16"
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
+{
+    static constexpr auto name = "bf16";
+};
+
+// Registers the command-line options of this example (each with its default value
+// and help text) and parses argv.
+// Returns a tuple {parse succeeded, populated parser}.
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser parser;
+
+    // problem size
+    parser.insert("n", "32", "n dimension");
+    parser.insert("h", "7", "h dimension");
+    parser.insert("w", "7", "w dimension");
+    parser.insert("c", "512", "c dimension");
+
+    // validation / precision / timing
+    parser.insert("v", "1", "cpu validation or not");
+    parser.insert("prec", "fp16", "precision");
+    parser.insert("warmup", "5", "cold iter");
+    parser.insert("repeat", "20", "hot iter");
+
+    // result dumping
+    parser.insert("json", "0", "0: No Json, 1: Dump Results in Json format");
+    parser.insert("jsonfile", "multi_reduce.json", "json file name to dump results");
+
+    const bool parsed_ok = parser.parse(argc, argv);
+    return std::make_tuple(parsed_ok, parser);
+}
+
+// Runs the thread-wise multiple-reduction example for element type `DataType`.
+//
+// Reads the N/H/W/C extents plus validation and timing settings from `arg_parser`,
+// reduces a 4D {N, H, W, C} input over H and W (keeping N and C) with two fused
+// operations -- a plain sum and a mean of squares -- times the kernel, and, when "-v"
+// is non-zero, compares every output against the host reference.
+//
+// Returns true when validation passes (or is disabled); returns false when validation
+// fails or when the kept or the reduced extent is zero.
+// Throws std::runtime_error when Kernel::IsSupportedArgument rejects the arguments.
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
+
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    // Validate input dimensions
+    const ck_tile::index_t kept_dim_len_prod   = N * C;
+    const ck_tile::index_t reduce_total_length = H * W;
+
+    if(kept_dim_len_prod == 0)
+    {
+        std::cerr << "Warning: Product of kept dimensions is zero (N=" << N << ", C=" << C
+                  << ", product=" << kept_dim_len_prod << ")." << std::endl;
+        std::cerr << "This will result in an empty output tensor." << std::endl;
+        return false;
+    }
+
+    if(reduce_total_length == 0)
+    {
+        std::cerr << "Warning: Product of reduce dimensions is zero (H=" << H << ", W=" << W
+                  << ", product=" << reduce_total_length << ")." << std::endl;
+        std::cerr << "This will result in an empty reduction with no data to process." << std::endl;
+        std::cerr << "The kernel will exit early without performing any computation." << std::endl;
+        return false;
+    }
+
+    // Packed row-major layout over {N, H, W, C}: C is the fastest-varying dimension.
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<0, 3>{}; // Which dimensions to keep
+    constexpr auto reduce_dims = ck_tile::sequence<1, 2>{}; // Which dimensions to reduce
+
+    // One {N, C} output per operation: operation 0 is the sum, operation 1 the mean square.
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_add_ref({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_mean_sq_ref({N, C}, {C, 1});
+    auto y_host_ref_tuple = ck_tile::make_tuple(y_host_add_ref, y_host_mean_sq_ref);
+
+    ck_tile::HostTensor<YDataType> y_host_add_dev({N, C}, {C, 1});
+    ck_tile::HostTensor<YDataType> y_host_mean_sq_dev({N, C}, {C, 1});
+    auto y_host_dev_tuple = ck_tile::make_tuple(y_host_add_dev, y_host_mean_sq_dev);
+
+    const auto number_operations = y_host_dev_tuple.size();
+
+    // Two operations: one does a sum reduction, the other computes the mean square
+    // (square each element, sum, then divide by the number of reduced elements).
+    auto reduce_ops =
+        ck_tile::make_tuple(ck_tile::ReduceOp::Add{}, ck_tile::ReduceOp::Add{}); // reductions ops
+    auto elementwise_ops =
+        ck_tile::make_tuple(ck_tile::element_wise::PassThrough{},
+                            ck_tile::element_wise::UnarySquare{}); // Elementwise ops
+    auto accumulator_elementwise_ops = ck_tile::make_tuple(
+        ck_tile::element_wise::PassThrough{},
+        ck_tile::element_wise::UnaryDivide{
+            reduce_total_length}); // Accumulator elementwise ops applied to the reduction result
+
+    auto y_buf_size = number_operations *
+                      y_host_dev_tuple.at(ck_tile::number<0>{}).get_element_space_size_in_bytes();
+    ck_tile::DeviceMem y_buf(y_buf_size);
+
+    // Distance, in elements, between the outputs of two consecutive operations.
+    const auto output_tensor_offset = N * C;
+
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+
+    using BlockWarps = ck_tile::sequence<4, 1>;
+    using BlockTile  = ck_tile::sequence<128, 128>;
+    using WarpTile   = ck_tile::sequence<32, 128>;
+    using ThreadTile = ck_tile::sequence<8, 8>;
+
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    // ceil(kept elements / first BlockTile extent) blocks
+    ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
+    std::cout << "grid size " << kGridSize << std::endl;
+
+    using Shape   = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+    using Problem = ck_tile::Reduce2dProblem<XDataType,
+                                             ComputeDataType,
+                                             YDataType,
+                                             Shape,
+                                             decltype(reduce_ops),
+                                             decltype(kept_dim),
+                                             decltype(reduce_dims),
+                                             4>;
+
+    using Kernel                      = ck_tile::MultiReduceThreadWise<Problem>;
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
+
+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    if(!Kernel::IsSupportedArgument(
+           C, input_strides)) // output tensor's contiguous dimension and input strides
+    {
+        throw std::runtime_error("Wrong! Arguments not supported!\n");
+    }
+
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                          input_shape,
+                                          input_strides,
+                                          kept_dim,
+                                          reduce_dims,
+                                          output_tensor_offset,
+                                          elementwise_ops,
+                                          accumulator_elementwise_ops));
+
+    // Bytes moved: the whole input is read once, one {N, C} output is counted as written.
+    std::size_t num_btype = sizeof(XDataType) * N * C * H * W + sizeof(YDataType) * N * C;
+
+    // bytes / 1e6 / milliseconds == GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // Host staging buffer receiving the outputs of all operations back to back.
+        std::vector<YDataType> h(number_operations * N * C);
+
+        // reference
+        ck_tile::reference_multiple_reduce<XDataType, ComputeDataType, YDataType>(
+            x_host,
+            y_host_ref_tuple,
+            reduce_ops,
+            kept_dim,
+            reduce_dims,
+            elementwise_ops,
+            accumulator_elementwise_ops);
+        // The whole device buffer is copied back below, so report its full size.
+        std::cout << "Read " << y_buf_size << " Bytes from the device" << std::endl;
+
+        // Transfer data from device and check error for each operation
+        y_buf.FromDevice(h.data());
+        ck_tile::static_for<0, number_operations, 1>{}([&](auto i) {
+            // Unpack the i-th operation's slice of the staging buffer into its host tensor.
+            std::memcpy(y_host_dev_tuple.get(ck_tile::number<i>{}).data(),
+                        h.data() + i * output_tensor_offset,
+                        output_tensor_offset * sizeof(YDataType));
+            pass &= ck_tile::check_err(y_host_dev_tuple.get(ck_tile::number<i>{}),
+                                       y_host_ref_tuple.get(ck_tile::number<i>{}));
+        });
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+// Entry point: parses the command line and dispatches on "-prec".
+// Exit codes: 0 success, -1 argument parsing failed, -2 the run failed (or its input
+// was rejected), -3 unsupported precision.
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+
+    // Only fp16 is dispatched here. Reject everything else explicitly: falling off the end
+    // of main() would exit with status 0 although nothing was run.
+    std::cerr << "Unsupported precision: " << data_type << " (supported: fp16)" << std::endl;
+    return -3;
+}
--- a/example/ck_tile/05_reduce/reduce.cpp
+++ b/example/ck_tile/05_reduce/reduce.cpp
@@ -0,0 +1,154 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+#include <cstring>
+
+// Registers the command-line options of this example (each with its default value
+// and help text) and parses argv.
+// Returns a tuple {parse succeeded, populated parser}.
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser parser;
+
+    // problem size
+    parser.insert("n", "16", "n dimension");
+    parser.insert("h", "64", "h dimension");
+    parser.insert("w", "32", "w dimension");
+    parser.insert("c", "960", "c dimension");
+
+    // validation / precision / timing
+    parser.insert("v", "1", "cpu validation or not");
+    parser.insert("prec", "fp16", "precision");
+    parser.insert("warmup", "20", "cold iter");
+    parser.insert("repeat", "100", "hot iter");
+
+    // result dumping
+    parser.insert("json", "0", "0: No Json, 1: Dump Results in Json format");
+    parser.insert("jsonfile", "reduce.json", "json file name to dump results");
+
+    const bool parsed_ok = parser.parse(argc, argv);
+    return std::make_tuple(parsed_ok, parser);
+}
+
+// Runs the single-reduction example for element type `DataType`.
+//
+// Reads the N/H/W/C extents plus validation and timing settings from `arg_parser`,
+// sum-reduces a 4D {N, H, W, C} input over N (keeping H, W and C), times the kernel,
+// optionally validates against the host reference and, when "-json=1", dumps the
+// results to the file named by "-jsonfile".
+//
+// Returns true when validation passes (or is disabled); returns false when validation
+// fails or when the kept extent H * W * C is zero.
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    using XDataType       = DataType;
+    using ComputeDataType = float;
+    using YDataType       = DataType;
+
+    ck_tile::index_t N = arg_parser.get_int("n");
+    ck_tile::index_t H = arg_parser.get_int("h");
+    ck_tile::index_t W = arg_parser.get_int("w");
+    ck_tile::index_t C = arg_parser.get_int("c");
+    int do_validation  = arg_parser.get_int("v");
+    int warmup         = arg_parser.get_int("warmup");
+    int repeat         = arg_parser.get_int("repeat");
+
+    // The grid size below is derived from the number of kept elements; with an empty
+    // output it would be 0, so reject that case up front (as the multi-reduce examples
+    // in this directory do) instead of attempting a launch.
+    const ck_tile::index_t kept_dim_len_prod = H * W * C;
+    if(kept_dim_len_prod == 0)
+    {
+        std::cerr << "Warning: Product of kept dimensions is zero (H=" << H << ", W=" << W
+                  << ", C=" << C << ", product=" << kept_dim_len_prod << ")." << std::endl;
+        std::cerr << "This will result in an empty output tensor." << std::endl;
+        return false;
+    }
+
+    // Packed row-major layout over {N, H, W, C}: C is the fastest-varying dimension.
+    std::vector<ck_tile::index_t> problem_shape = {N, H, W, C};
+    std::vector<ck_tile::index_t> strides(4);
+    strides[0] = H * W * C;
+    strides[1] = W * C;
+    strides[2] = C;
+    strides[3] = 1;
+
+    // Define reduction specification:
+    constexpr auto kept_dim    = ck_tile::sequence<1, 2, 3>{}; // Which dimensions to keep
+    constexpr auto reduce_dims = ck_tile::sequence<0>{};       // Which dimensions to reduce
+
+    ck_tile::HostTensor<XDataType> x_host(problem_shape, strides);
+    ck_tile::HostTensor<YDataType> y_host_ref({H, W, C}, {W * C, C, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({H, W, C}, {W * C, C, 1});
+
+    ck_tile::FillUniformDistribution<XDataType>{-5.f, 5.f}(x_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+
+    using ReduceOp   = ck_tile::ReduceOp::Add;
+    using BlockWarps = ck_tile::sequence<1, 1>;
+    using BlockTile  = ck_tile::sequence<256, 1>;
+    using WarpTile   = ck_tile::sequence<256, 1>;
+    using ThreadTile = ck_tile::sequence<1, 1>;
+
+    // cross warp-reduce
+    // using BlockWarps = ck_tile::sequence<2, 2>;
+    // using BlockTile  = ck_tile::sequence<2, 1024>;
+    // using WarpTile   = ck_tile::sequence<1, 512>;
+    // using ThreadTile = ck_tile::sequence<1, 8>;
+
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+    // ceil(kept elements / first BlockTile extent) blocks
+    ck_tile::index_t kGridSize = (kept_dim_len_prod + BlockTile::at(ck_tile::number<0>{}) - 1) /
+                                 BlockTile::at(ck_tile::number<0>{});
+    std::cout << "grid size " << kGridSize << std::endl;
+
+    using Shape   = ck_tile::Reduce2dShape<BlockWarps, BlockTile, WarpTile, ThreadTile>;
+    using Problem = ck_tile::Reduce2dProblem<XDataType,
+                                             ComputeDataType,
+                                             YDataType,
+                                             Shape,
+                                             ReduceOp,
+                                             decltype(kept_dim),
+                                             decltype(reduce_dims),
+                                             4>;
+
+    using Kernel                      = ck_tile::ReduceKernel<Problem>;
+    const ck_tile::index_t kBlockSize = Kernel::BlockSize();
+    // Create input tensor shape and strides
+    auto input_shape =
+        ck_tile::make_tuple(problem_shape[0], problem_shape[1], problem_shape[2], problem_shape[3]);
+    auto input_strides = ck_tile::make_tuple(strides[0], strides[1], strides[2], strides[3]);
+
+    float ave_time = launch_kernel(
+        ck_tile::stream_config{nullptr, true, 0, warmup, repeat},
+        ck_tile::make_kernel<kBlockPerCu>(Kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          static_cast<XDataType*>(x_buf.GetDeviceBuffer()),
+                                          static_cast<YDataType*>(y_buf.GetDeviceBuffer()),
+                                          input_shape,
+                                          input_strides));
+
+    // Bytes moved: the whole input is read once and the {H, W, C} output written once.
+    std::size_t num_btype = sizeof(XDataType) * N * H * W * C + sizeof(YDataType) * H * W * C;
+
+    // bytes / 1e6 / milliseconds == GB/s
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        ck_tile::reference_reduce<XDataType, ComputeDataType, YDataType>(
+            x_host, y_host_ref, ReduceOp{}, kept_dim, reduce_dims);
+        y_buf.FromDevice(y_host_dev.mData.data());
+        pass = ck_tile::check_err(y_host_dev, y_host_ref);
+
+        std::cout << "valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_reduce_json_results<DataType, ck_tile::DataTypeTraits>(
+            arg_parser.get_str("jsonfile"), N, C, H, W, pass, ave_time, 0, gb_per_sec);
+    }
+
+    return pass;
+}
+
+// Entry point: parses the command line and dispatches on "-prec".
+// Exit codes: 0 success, -1 argument parsing failed, -2 the run failed,
+// -3 unsupported precision.
+int main(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return -1;
+
+    const std::string data_type = arg_parser.get_str("prec");
+
+    if(data_type == "fp16")
+    {
+        return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
+    }
+    else if(data_type == "bf16")
+    {
+        return run<ck_tile::bf16_t>(arg_parser) ? 0 : -2;
+    }
+
+    // Reject any other precision explicitly: falling off the end of main() would exit
+    // with status 0 although nothing was run.
+    std::cerr << "Unsupported precision: " << data_type << " (supported: fp16, bf16)"
+              << std::endl;
+    return -3;
+}
--- a/example/ck_tile/06_permute/CMakeLists.txt
+++ b/example/ck_tile/06_permute/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# not using add_example_executable() to add this target, since we don't want this to have
+# to be included in "make all/install/check"
+add_executable(tile_example_permute permute.cpp)
+
+# The alternative implementation (alternative_impl/matrix_core_swizzle.cpp) is enabled by
+# default; define PERMUTE_USE_ALTERNATIVE_IMPL to a false value before this point
+# (e.g. -DPERMUTE_USE_ALTERNATIVE_IMPL=OFF) to build without it.
+if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
+  set(PERMUTE_USE_ALTERNATIVE_IMPL true)
+endif()
+if(PERMUTE_USE_ALTERNATIVE_IMPL)
+  # A preprocessor macro belongs in the compile definitions rather than in raw compiler options.
+  target_compile_definitions(tile_example_permute PRIVATE PERMUTE_USE_ALTERNATIVE_IMPL)
+  target_sources(tile_example_permute PRIVATE alternative_impl/matrix_core_swizzle.cpp)
+endif()
+# Debugging aid: to see the compiler invocation and keep intermediate files, temporarily add
+#   target_compile_options(tile_example_permute PRIVATE -v --save-temps -Wno-gnu-line-marker)
--- a/example/ck_tile/06_permute/README.md
+++ b/example/ck_tile/06_permute/README.md
@@ -0,0 +1,94 @@
+# Permute with CK Tile
+
+This example demonstrates generic tensor permutation, which is similar to [torch.permute](https://pytorch.org/docs/stable/generated/torch.permute.html) (combined with [torch.contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html)). Currently we implement a generic permute kernel that supports arbitrary permutations of tensors up to rank 8 with a single kernel instance. Performance is not the primary consideration; we prefer a simple and general kernel implementation using `ck_tile` in this example.
+
+
+---
+
+## Algorithm and Math
+
+Given a tensor $X$ of shape $[d_0, d_1, ..., d_{n-1}]$ and a permutation $\pi$, compute:
+$$
+Y_{i_0, i_1, ..., i_{n-1}} = X_{i_{\pi(0)}, i_{\pi(1)}, ..., i_{\pi(n-1)}}
+$$
+
+- **Tilewise Permute**: Each thread block processes a tile (block) of the input, computes the permuted indices, and writes to the output.
+
+---
+
+## Tile Programming Model
+
+- **Tiles**: Each thread block processes a tile of the input tensor.
+- **Alternative Implementation**: For rank-7 tensors, a swizzled layout is supported for matrix core-friendly data loading.
+
+---
+
+## Build & Run
+
+### Arguments
+```
+args:
+          -v    weather do CPU validation or not (default:1)
+       -prec    data type. fp16/bf16/fp32 (default:fp16)
+      -shape    the shape of the input tensor (default:2,3,4)
+       -perm    permute perm (default:2,1,0)
+```
+```
+# in the root of ck_tile
+mkdir build && cd build
+../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with your target, e.g. gfx90a, gfx942...
+make tile_example_permute -j
+```
+
+This will result in an executable `build/bin/tile_example_permute`
+
+
+### Further Examples
+
+```
+# torch
+x=torch.randn(2,3,4,6)
+y=x.permute(0,3,2,1).contiguous()
+
+# ck_tile
+./build/bin/tile_example_permute -shape=2,3,4,6 -perm=0,3,2,1
+```
+
+You can try the smoke_test:
+
+```
+# in the root of ck_tile, after you build this example
+sh example/ck_tile/06_permute/script/smoke_test.sh
+```
+
+### Alternative Implementation
+
+We have an alternative implementation under the `alternative_impl/` folder that can swizzle the tensor to be more friendly for data loading in the matrix core layout. It can be enabled when dealing with a `rank-7` tensor, with a fixed pattern of either `0,1,4,2,5,3,6` or `0,1,2,4,5,3,6`. This implementation has other shape limitations; check the source code of `permute.cpp` for details.
+
+```
+# example
+./build/bin/tile_example_permute -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6 # b_n0_k0_n1_k1_n2_k2
+./build/bin/tile_example_permute -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6 # b_n0_n1_k0_k1_n2_k2
+```
+
+---
+
+## Source Structure
+
+- **Kernel**: `permute.hpp` (tile-programming kernel template)
+- **Executable**: `permute.cpp` (argument parsing, kernel launch)
+- **Alternative**: `alternative_impl/` (swizzled layout for rank-7 tensors)
+- **Build**: `CMakeLists.txt`, `script/`
+
+---
+
+## Related CK Tile Examples
+
+- [03_gemm](../03_gemm/README.md): GEMM with tiles
+- [05_reduce](../05_reduce/README.md): Reductions with tiles
+- [35_batched_transpose](../35_batched_transpose/README.md): Batched transpose with tiles
+
+For distribution, see `include/ck_tile/tile_program/tile_distribution/`.
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.cpp
@@ -0,0 +1,101 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "matrix_core_swizzle.hpp"
+#include "matrix_core_swizzle_kernel.hpp"
+
+namespace {
+// Block-level tiling shared by every kernel instance dispatched below.
+constexpr int swizzle_block_size  = 256;
+constexpr int swizzle_n_per_block = 256;
+constexpr int swizzle_k_per_block = 128;
+
+// Instantiate the swizzle kernel for one (permute style, instruction) pair,
+// launch it and hand back whatever ck_tile::launch_kernel reports.
+template <matrix_core_permute_style Style, matrix_core_inst_enum Inst>
+float launch_swizzle_instance(matrix_core_swizzle_args a, const ck_tile::stream_config& s)
+{
+    using Kernel = matrix_core_swizzle_kernel<swizzle_block_size,
+                                              swizzle_n_per_block,
+                                              swizzle_k_per_block,
+                                              Style,
+                                              Inst>;
+
+    auto instance = Kernel(a);
+    return ck_tile::launch_kernel(s, instance);
+}
+
+// Second dispatch level: choose the permute style for an already-fixed
+// instruction. Returns -1 when the permute string is not one of the supported
+// patterns.
+template <matrix_core_inst_enum Inst>
+float dispatch_swizzle_permute(const std::string& permute,
+                               matrix_core_swizzle_args a,
+                               const ck_tile::stream_config& s)
+{
+    if(permute == "0,1,4,2,5,3,6")
+    {
+        return launch_swizzle_instance<matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2,
+                                       Inst>(a, s);
+    }
+    if(permute == "0,1,2,4,5,3,6")
+    {
+        return launch_swizzle_instance<matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2,
+                                       Inst>(a, s);
+    }
+    if(permute == "0,1,3,4,2,5")
+    {
+        return launch_swizzle_instance<matrix_core_permute_style::b_nr_kr_kw_nw_kv, Inst>(a, s);
+    }
+    return -1;
+}
+} // namespace
+
+// Host entry point of the alternative (matrix-core swizzle) permute path.
+//
+// t : selects the kernel instance by data type ("fp16"), instruction
+//     ("32x32x8" or "16x16x16") and permute pattern.
+// a : source/destination pointers and the batch / n / k problem size.
+// s : stream configuration forwarded to ck_tile::launch_kernel.
+//
+// Returns the value produced by ck_tile::launch_kernel for the selected
+// instance, or -1 when no instance matches the requested traits.
+float matrix_core_swizzle(matrix_core_swizzle_traits t,
+                          matrix_core_swizzle_args a,
+                          const ck_tile::stream_config& s)
+{
+    // Only fp16 instances are generated.
+    if(t.data_type != "fp16")
+    {
+        return -1;
+    }
+
+    if(t.inst == "32x32x8")
+    {
+        return dispatch_swizzle_permute<matrix_core_inst_enum::MFMA_32x32x8_F16>(t.permute, a, s);
+    }
+    if(t.inst == "16x16x16")
+    {
+        return dispatch_swizzle_permute<matrix_core_inst_enum::MFMA_16x16x16_F16>(t.permute, a, s);
+    }
+    return -1;
+}
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle.hpp
@@ -0,0 +1,20 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include "matrix_core_swizzle_kernel.hpp"
+#include <string>
+
+// String-keyed description of the kernel instance that matrix_core_swizzle()
+// should dispatch to. Any combination it does not recognise makes the host API
+// return -1.
+struct matrix_core_swizzle_traits
+{
+    std::string data_type; // fp16 only
+    std::string inst;      // 32x32x8, 16x16x16
+    std::string permute;   // 0,1,4,2,5,3,6 / 0,1,2,4,5,3,6 / 0,1,3,4,2,5
+};
+
+using matrix_core_swizzle_args = matrix_core_swizzle_host_args;
+
+// host API
+// Picks a kernel instance from the traits strings, launches it with the given
+// arguments and stream configuration, and returns the result of
+// ck_tile::launch_kernel. Returns -1 when the traits do not match any instance.
+float matrix_core_swizzle(matrix_core_swizzle_traits,
+                          matrix_core_swizzle_args,
+                          const ck_tile::stream_config&);
--- a/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
+++ b/example/ck_tile/06_permute/alternative_impl/matrix_core_swizzle_kernel.hpp
@@ -0,0 +1,413 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/gemm.hpp"
+
+// if set to 1, slightly more instructions generated to calculate address
+#ifndef MERGE_2D_013425
+#define MERGE_2D_013425 0
+#endif
+
+// Matrix-core instruction flavour a kernel instance is built for. Each value
+// is mapped to a ck_tile warp-gemm type by detail::to_warp_gemm.
+enum class matrix_core_inst_enum
+{
+    MFMA_32x32x8_F16  = 0,
+    MFMA_16x16x16_F16 = 1,
+};
+
+namespace detail {
+// Compile-time map from matrix_core_inst_enum to the ck_tile warp-gemm type.
+// The primary template is intentionally left undefined so that an unmapped
+// enum value fails to compile.
+template <matrix_core_inst_enum>
+struct to_warp_gemm;
+
+template <>
+struct to_warp_gemm<matrix_core_inst_enum::MFMA_32x32x8_F16>
+{
+    using type = ck_tile::WarpGemmMfmaF16F16F32M32N32K8;
+};
+
+template <>
+struct to_warp_gemm<matrix_core_inst_enum::MFMA_16x16x16_F16>
+{
+    using type = ck_tile::WarpGemmMfmaF16F16F32M16N16K16;
+};
+} // namespace detail
+// Convenience alias: to_warp_gemm_t<Inst> is the warp-gemm type for Inst.
+template <matrix_core_inst_enum Inst>
+using to_warp_gemm_t = typename detail::to_warp_gemm<Inst>::type;
+
+// Destination layouts supported by the swizzle kernel. The trailing comment on
+// each enumerator is the permutation string that selects it on the host side.
+// TODO: in below permute pattern, the last 3 dim is within wave
+enum class matrix_core_permute_style
+{
+    permute_b_n0_k0_n1_k1_n2_k2 = 0, // 0,1,4,2,5,3,6
+    permute_b_n0_n1_k0_k1_n2_k2 = 1, // 0,1,2,4,5,3,6
+    b_nr_kr_kw_nw_kv            = 2, // 0,1,3,4,2,5
+    // alias: same value as b_nr_kr_kw_nw_kv, with (kw, nw, kv) viewed as one flat dim
+    b_nr_kr_waveflatten         = b_nr_kr_kw_nw_kv,
+};
+
+// assume this is B matrix, originally we have batch*n*k
+// now batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
+// assume using 32x32x8-f16, 4 waves and extend the KPerLane to 8xfp16(dwordx4)
+//
+//                                      4(waves)  32(mfma_m lane)
+//                                          |      |
+// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2 -> 8(thread loading)
+//                                    nr  kr    |
+//        nr  4  32 kr 2  8                     2(klane)
+//
+// permute: 0,1,4,2,5,3,6
+// or
+// batch* n0*n1*n2*k0*k1*k2 -> batch* n0*n1*k0*k1*n2*k2 -> 8(thread loading)
+// permute: 0,1,2,4,5,3,6
+//
+// this kernel only deals with fp16/bf16 data (16-bit), and uses a 2d block size to do the swizzling
+// for simplicity, only n/k that are multiples of the block size are considered
+
+// independent host arg with no template
+// Plain argument bundle shared by the host API and the device kernel (the
+// kernel struct aliases it as both `karg` and `harg`).
+struct matrix_core_swizzle_host_args
+{
+    const void* p_src; // source buffer; the kernel reads it as fp16_t
+    void* p_dst;       // destination buffer; the kernel writes it as fp16_t
+    int32_t batch;     // number of batches; becomes the grid z extent
+    int32_t n;         // total N extent of one batch
+    int32_t k;         // total K extent of one batch
+};
+
+// NOTE: this kernel could follow the style of generic permute kernel
+// but here we pass in fixed layout as template arg and generate different kernel instance
+// purposely
+template <int BLOCK_SIZE_ = 256,
+          int NPerBlock_  = 256,
+          int KPerBlock_  = 128,
+          matrix_core_permute_style pstyle_ =
+              matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2,
+          matrix_core_inst_enum Inst_ = matrix_core_inst_enum::MFMA_32x32x8_F16>
+struct matrix_core_swizzle_kernel
+{
+    using karg = matrix_core_swizzle_host_args;
+    using harg = matrix_core_swizzle_host_args;
+
+    static constexpr int BLOCK_SIZE                   = BLOCK_SIZE_;
+    static constexpr int WavesPerBlock_N              = BLOCK_SIZE / ck_tile::get_warp_size();
+    static constexpr int WavesPerBlock_K              = 1;
+    static constexpr int NPerBlock                    = NPerBlock_;
+    static constexpr int KPerBlock                    = KPerBlock_;
+    static constexpr matrix_core_permute_style pstyle = pstyle_;
+    static constexpr matrix_core_inst_enum Inst       = Inst_;
+
+    static constexpr ck_tile::index_t Alignment = 8;
+    karg a;
+    dim3 grids;
+
+    using WarpGemm = to_warp_gemm_t<Inst>;
+
+    // Stores the host arguments and sizes the launch grid as
+    // (ceil(k / KPerBlock), ceil(n / NPerBlock), batch).
+    __host__ matrix_core_swizzle_kernel(harg h)
+    {
+        a                   = h;
+        ck_tile::index_t ns = (h.n + NPerBlock - 1) / NPerBlock; // blocks along N, rounded up
+        ck_tile::index_t ks = (h.k + KPerBlock - 1) / KPerBlock; // blocks along K, rounded up
+        grids               = dim3(ks, ns, h.batch);
+    }
+
+    // Returns true when both n and k are exact multiples of the block tile.
+    // Nothing in this struct calls it; a caller that needs this guarantee has
+    // to check it explicitly before launching.
+    __host__ bool is_applicable(harg h) { return h.n % NPerBlock == 0 && h.k % KPerBlock == 0; }
+
+    // Launches the nested device functor `kernel` on the stream taken from `s`,
+    // using the grid computed in the constructor, BLOCK_SIZE threads per block
+    // and no dynamic shared memory.
+    __host__ void operator()(const ck_tile::stream_config& s) const
+    {
+        ck_tile::kentry<1, kernel><<<grids, BLOCK_SIZE, 0, s.stream_id_>>>(a);
+    }
+
+    struct kernel
+    {
+        static constexpr int kBlockSize = BLOCK_SIZE;
+        // Tile distribution used to load one block tile from the source, whose
+        // dimensions are ordered (N0, N1, N2, K0, K1, K2).
+        // Per the inline labels below, N1 and (K1, N2) are bound to the two P
+        // groups while N0, K0 and K2 are the per-thread Y dims.
+        // NOTE(review): presumably the first P group indexes warps and the second
+        // indexes lanes - confirm against the tile_distribution_encoding docs.
+        __device__ static constexpr auto get_src_dist()
+        {
+            using namespace ck_tile;
+            constexpr index_t K2 = Alignment; // elements handled contiguously per thread along K
+            constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t N1 = BLOCK_SIZE / get_warp_size(); // warps per block
+
+            // the block tile must be an exact multiple of what one pass covers
+            static_assert(NPerBlock % (N1 * N2) == 0);
+            static_assert(KPerBlock % (K1 * K2) == 0);
+
+            // remaining repeats needed to cover the block tile
+            constexpr index_t K0 = KPerBlock / (K1 * K2);
+            constexpr index_t N0 = NPerBlock / (N1 * N2);
+
+            // clang-format off
+            return make_static_tile_distribution(
+                tile_distribution_encoding<
+                    sequence<1>,// 0
+                    //             1              2            3             4             5             6
+                    tuple<sequence<N0>, sequence<N1>, sequence<N2>, sequence<K0>, sequence<K1>, sequence<K2>>,
+
+                    //            N1           K1  N2
+                    tuple<sequence<2>, sequence<5, 3>>,
+                    tuple<sequence<0>, sequence<0, 0>>,
+
+                    //       N0 K0 K2
+                    sequence<1, 4, 6>,
+                    sequence<0, 0, 0>>{});
+            // clang-format on
+        }
+        // Tile distribution used to store one block tile to the destination.
+        // It binds the same logical dims to P/Y as get_src_dist() (see the inline
+        // labels), but the H dims are listed in the destination's dimension
+        // order, which depends on `pstyle`:
+        //   permute_b_n0_k0_n1_k1_n2_k2 : (N0, K0, N1, K1, N2, K2)
+        //   permute_b_n0_n1_k0_k1_n2_k2 : (N0, N1, K0, K1, N2, K2)
+        //   otherwise (b_nr_kr_kw_nw_kv) : (Nr, Kr, Kw*Nw*Kv), or a merged 2-D
+        //                                  (N, K) form when MERGE_2D_013425 is set
+        __device__ static constexpr auto get_dst_dist()
+        {
+            using namespace ck_tile;
+            constexpr index_t K2 = Alignment;
+            constexpr index_t N2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t K1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t N1 = BLOCK_SIZE / get_warp_size();
+
+            static_assert(NPerBlock % (N1 * N2) == 0);
+            static_assert(KPerBlock % (K1 * K2) == 0);
+
+            constexpr index_t K0 = KPerBlock / (K1 * K2);
+            constexpr index_t N0 = NPerBlock / (N1 * N2);
+
+            if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+            {
+                // clang-format off
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<
+                        sequence<1>,// 0
+                        //             1              2            3             4             5             6
+                        tuple<sequence<N0>, sequence<K0>, sequence<N1>, sequence<K1>, sequence<N2>, sequence<K2>>,
+
+                        //            N1           K1  N2
+                        tuple<sequence<3>, sequence<4, 5>>,
+                        tuple<sequence<0>, sequence<0, 0>>,
+
+                        //       N0 K0 K2
+                        sequence<1, 2, 6>,
+                        sequence<0, 0, 0>>{});
+                // clang-format on
+            }
+            else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+            {
+                // clang-format off
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<
+                        sequence<1>,// 0
+                        //             1              2            3             4             5             6
+                        tuple<sequence<N0>, sequence<N1>, sequence<K0>, sequence<K1>, sequence<N2>, sequence<K2>>,
+
+                        //            N1           K1  N2
+                        tuple<sequence<2>, sequence<4, 5>>,
+                        tuple<sequence<0>, sequence<0, 0>>,
+
+                        //       N0 K0 K2
+                        sequence<1, 3, 6>,
+                        sequence<0, 0, 0>>{});
+                // clang-format on
+            }
+            else
+            {
+                // clang-format off
+                // b_nr_kr_kw_nw_kv or b_nr_kr_waveflatten
+                // Kv/Nw/Kw carry the same values as K2/N2/K1 above, renamed to
+                // match the layout name.
+                constexpr index_t Kv = Alignment;
+                constexpr index_t Nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                constexpr index_t Kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+
+                static_assert(KPerBlock % (K1 * K2) == 0);
+                constexpr index_t Nr = NPerBlock / Nw;
+                constexpr index_t Kr = KPerBlock / (Kv * Kw);
+
+                // split the per-block repeats into a P part (_p, one per wave) and
+                // a Y part (_y, iterated by each thread)
+                constexpr index_t Nr_p = WavesPerBlock_N;
+                constexpr index_t Kr_p = WavesPerBlock_K;
+                constexpr index_t Nr_y = Nr / Nr_p;
+                constexpr index_t Kr_y = Kr / Kr_p;
+
+                return make_static_tile_distribution(
+#if MERGE_2D_013425
+                    tile_distribution_encoding<
+                        sequence<1>,// 0    R
+                        // major       1                         2
+                        // minor       0     1     2             0     1     2   3
+                        tuple<sequence<Nr_y, Nr_p, Nw>, sequence<Kr_y, Kr_p, Kw, Kv>>,    // H
+
+                        //            Nr_p, Kr_p         Kw Nw
+                        tuple<sequence<1  , 2>, sequence<2, 1>>,    // p major
+                        tuple<sequence<1  , 1>, sequence<2, 2>>,    // p minor
+
+                        //       Nr_y Kr_y Kv
+                        sequence<1,   2,   2>,          // Y major
+                        sequence<0,   0,   3>>{});      // y minor
+#else
+                    tile_distribution_encoding<
+                        sequence<1>,// 0    R
+                        // major       1                     2                     3
+                        // minor       0     1               0     1               0   1   2
+                        tuple<sequence<Nr_y, Nr_p>, sequence<Kr_y, Kr_p>, sequence<Kw, Nw, Kv>>,    // H
+
+                        //            Nr_p, Kr_p         Kw Nw
+                        tuple<sequence<1  , 2>, sequence<3, 3>>,    // p major
+                        tuple<sequence<1  , 1>, sequence<0, 1>>,    // p minor
+
+                        //       Nr_y Kr_y Kv
+                        sequence<1,   2,   3>,          // Y major
+                        sequence<0,   0,   2>>{});      // y minor
+#endif
+                // clang-format on
+            }
+        }
+
+        // Device entry point: each block copies one (NPerBlock x KPerBlock) tile
+        // of one batch from the source layout (n0, n1, n2, k0, k1, k2) to the
+        // destination layout selected by `pstyle`.
+        //
+        // blockIdx.x / .y / .z select the K tile, the N tile and the batch.
+        // The data is loaded with the source distribution and stored with the
+        // destination distribution; the per-thread buffer is copied unchanged in
+        // between, so the permutation comes purely from the two tensor views.
+        __device__ void operator()(karg a_)
+        {
+            using namespace ck_tile;
+            index_t i_k = blockIdx.x;
+            index_t i_n = blockIdx.y;
+            index_t i_b = blockIdx.z;
+
+            // full-tensor dimension lengths (n0 / k0 depend on the runtime n / k)
+            constexpr index_t k2 = Alignment;
+            constexpr index_t n2 = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t k1 = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t n1 = BLOCK_SIZE / get_warp_size();
+            const index_t k0     = a_.k / (k1 * k2);
+            const index_t n0     = a_.n / (n1 * n2);
+
+            // per-block window lengths (all compile-time)
+            constexpr index_t k2_tile = Alignment;
+            constexpr index_t n2_tile = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+            constexpr index_t k1_tile = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+            constexpr index_t n1_tile = BLOCK_SIZE / get_warp_size();
+            constexpr index_t k0_tile = KPerBlock / (k1_tile * k2_tile);
+            constexpr index_t n0_tile = NPerBlock / (n1_tile * n2_tile);
+
+            // Promote to 64-bit before multiplying: batch * k * n can exceed the
+            // range of the 32-bit index_t for large tensors, which would wrap the
+            // per-batch pointer offset.
+            const long_index_t batch_offset = static_cast<long_index_t>(i_b) * a_.k * a_.n;
+
+            const fp16_t* p_src = reinterpret_cast<const fp16_t*>(a_.p_src) + batch_offset;
+            fp16_t* p_dst       = reinterpret_cast<fp16_t*>(a_.p_dst) + batch_offset;
+
+            // packed 6-D view of the current batch in source order
+            const auto src_view = [&]() {
+                const auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                    p_src,
+                    make_tuple(n0, n1, n2, k0, k1, k2),
+                    number<Alignment>{}); // control vector load
+                return tmp;
+            }();
+
+            const auto src_window = make_tile_window(src_view,
+                                                     make_tuple(number<n0_tile>{},
+                                                                number<n1_tile>{},
+                                                                number<n2_tile>{},
+                                                                number<k0_tile>{},
+                                                                number<k1_tile>{},
+                                                                number<k2_tile>{}),
+                                                     {i_n * n0_tile, 0, 0, i_k * k0_tile, 0, 0},
+                                                     get_src_dist());
+
+            // packed view of the current batch in destination order
+            auto dst_view = [&]() {
+                if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+                {
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(n0, k0, n1, k1, n2, k2),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+                }
+                else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+                {
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(n0, n1, k0, k1, n2, k2),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+                }
+                else
+                {
+#if MERGE_2D_013425
+                    constexpr index_t kv = Alignment;
+                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    // constexpr index_t waveflatten = kw*nw*kv;
+                    const index_t kr = a_.k / (k1 * k2);
+                    const index_t nr = a_.n / nw;
+                    auto tmp         = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(nr, kr, number<kw>{}, number<nw>{}, number<kv>{}),
+                        number<Alignment>{}); // control vector load
+                    // merge (nr, nw) into one N dim and (kr, kw, kv) into one K dim
+                    auto tmp_1 = transform_tensor_view(
+                        tmp,
+                        make_tuple(
+                            make_merge_transform(make_tuple(nr, number<nw>{})),
+                            make_merge_transform(make_tuple(kr, number<kw>{}, number<kv>{}))),
+                        make_tuple(sequence<0, 3>{}, sequence<1, 2, 4>{}),
+                        make_tuple(sequence<0>{}, sequence<1>{}));
+                    return tmp_1;
+#else
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv,
+                    constexpr index_t kv          = Alignment;
+                    constexpr index_t nw          = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw          = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t waveflatten = kw * nw * kv;
+                    const index_t kr              = a_.k / (k1 * k2);
+                    const index_t nr              = a_.n / nw;
+                    auto tmp = make_naive_tensor_view_packed<address_space_enum::global>(
+                        p_dst,
+                        make_tuple(nr, kr, waveflatten),
+                        number<Alignment>{}); // control vector load
+                    return tmp;
+#endif
+                }
+            }();
+
+            // window of this block inside the destination view
+            auto dst_window = [&]() {
+                if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_k0_n1_k1_n2_k2)
+                {
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<n0_tile>{},
+                                                       number<k0_tile>{},
+                                                       number<n1_tile>{},
+                                                       number<k1_tile>{},
+                                                       number<n2_tile>{},
+                                                       number<k2_tile>{}),
+                                            {i_n * n0_tile, i_k * k0_tile, 0, 0, 0, 0},
+                                            get_dst_dist());
+                }
+                else if constexpr(pstyle == matrix_core_permute_style::permute_b_n0_n1_k0_k1_n2_k2)
+                {
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<n0_tile>{},
+                                                       number<n1_tile>{},
+                                                       number<k0_tile>{},
+                                                       number<k1_tile>{},
+                                                       number<n2_tile>{},
+                                                       number<k2_tile>{}),
+                                            {i_n * n0_tile, 0, i_k * k0_tile, 0, 0, 0},
+                                            get_dst_dist());
+                }
+                else
+                {
+#if MERGE_2D_013425
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                                            {i_n * NPerBlock, i_k * KPerBlock},
+                                            get_dst_dist());
+#else
+                    // b_nr_kr_waveflatten = b_nr_kr_kw_nw_kv
+                    constexpr index_t kv = Alignment;
+                    constexpr index_t nw = WarpGemm::WarpGemmAttribute::Impl::kAMLane;
+                    constexpr index_t kw = WarpGemm::WarpGemmAttribute::Impl::kABKLane;
+                    constexpr index_t waveflatten_tile = kw * nw * kv;
+                    constexpr index_t nr_tile          = NPerBlock / nw;
+                    constexpr index_t kr_tile          = KPerBlock / (kw * kv);
+                    return make_tile_window(dst_view,
+                                            make_tuple(number<nr_tile>{},
+                                                       number<kr_tile>{},
+                                                       number<waveflatten_tile>{}),
+                                            {i_n * nr_tile, i_k * kr_tile, 0},
+                                            get_dst_dist());
+#endif
+                }
+            }();
+
+            // actual load store
+            auto src_tile = load_tile(src_window);
+
+            // now we only swap the distribution from src to dst, no extra movement occurs
+            auto dst_tile                = make_static_distributed_tensor<fp16_t>(get_dst_dist());
+            dst_tile.get_thread_buffer() = src_tile.get_thread_buffer();
+
+            // final store
+            store_tile(dst_window, dst_tile);
+        }
+    };
+};
--- a/example/ck_tile/06_permute/permute.cpp
+++ b/example/ck_tile/06_permute/permute.cpp
@@ -0,0 +1,421 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "permute.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+#include <array>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+#include "alternative_impl/matrix_core_swizzle.hpp"
+#endif
+
+namespace detail {
+// Compile-time map from a size in bytes (1, 2 or 4) to the signed fixed-width
+// integer type of that size. The primary template is left undefined, so any
+// other byte count fails to compile.
+template <int bytes>
+struct to_integer_type;
+
+template <>
+struct to_integer_type<4>
+{
+    using type = int32_t;
+};
+template <>
+struct to_integer_type<2>
+{
+    using type = int16_t;
+};
+template <>
+struct to_integer_type<1>
+{
+    using type = int8_t;
+};
+} // namespace detail
+
+// Convenience alias: to_integer_type<N> is the N-byte signed integer type.
+template <int bytes>
+using to_integer_type = typename detail::to_integer_type<bytes>::type;
+
+namespace {
+// Build the generic permute kernel for one element type, launch it and return
+// whatever ck_tile::launch_kernel reports.
+template <typename DataType>
+float launch_generic_permute(permute_args a, const ck_tile::stream_config& s)
+{
+    using Problem = ck_tile::GenericPermuteProblem<DataType>;
+    using Kernel  = ck_tile::GenericPermute<Problem>;
+
+    auto kargs = Kernel::MakeKargs(a);
+
+    const dim3 grids  = Kernel::GridSize(a);
+    const dim3 blocks = Kernel::BlockSize();
+
+    return ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(Kernel{}, grids, blocks, 0, kargs));
+}
+} // namespace
+
+// host API (should come from codegen)
+// Dispatches on t.data_type ("fp8", "fp16" or "fp32") and launches the generic
+// permute kernel for the matching element type. Returns the value reported by
+// ck_tile::launch_kernel, or 0 when the data type is not one of the above.
+float permute(permute_traits t, permute_args a, const ck_tile::stream_config& s)
+{
+    if(t.data_type == "fp8")
+    {
+        return launch_generic_permute<ck_tile::fp8_t>(a, s);
+    }
+    if(t.data_type == "fp16")
+    {
+        return launch_generic_permute<ck_tile::half_t>(a, s);
+    }
+    if(t.data_type == "fp32")
+    {
+        return launch_generic_permute<float>(a, s);
+    }
+
+    return 0;
+}
+
+// Stream a vector as "[a, b, c]"; an empty vector prints as "[]".
+template <typename T>
+std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
+{
+    os << "[";
+
+    // Nothing is written before the first element, ", " before every later one.
+    bool is_first = true;
+    for(const auto& element : v)
+    {
+        if(!is_first)
+        {
+            os << ", ";
+        }
+        os << element;
+        is_first = false;
+    }
+
+    os << "]";
+    return os;
+}
+
+// Registers every command-line option of the permute example together with its
+// default value and help text, then parses argv.
+// Returns a tuple of (parse succeeded, populated parser).
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("v", "1", "whether to do CPU validation or not")
+        .insert("prec", "fp16", "data type. fp8/fp16/fp32 (representing 8/16/32 bit data)")
+        .insert("shape", "2,3,4", "the shape of the input tensor")
+        .insert("perm", "2,1,0", "permute perm")
+        .insert("kname", "0", "set to 1 to print kernel name")
+        .insert("seed",
+                "11939",
+                "random seed used for initializing input tensors. 0 for "
+                "non-deterministic seed")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "permute.json", "json file name to dump results");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// Validation tolerances, selected per element type.
+// Default: (rtol, atol) = (1e-3, 1e-3); the init method is ignored.
+template <typename DataType>
+auto get_elimit(std::string /*init_method*/)
+{
+    const double relative_tol = 1e-3;
+    const double absolute_tol = 1e-3;
+    return ck_tile::make_tuple(relative_tol, absolute_tol);
+}
+
+// bf16: looser (rtol, atol) = (1e-2, 1e-2); the init method is ignored.
+template <>
+auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+{
+    const double relative_tol = 1e-2;
+    const double absolute_tol = 1e-2;
+    return ck_tile::make_tuple(relative_tol, absolute_tol);
+}
+
+// fp8: returns (max rounding-point distance, atol) instead of (rtol, atol).
+// The "ui" and "ni" init methods get the tight limits (0, 2e-3); every other
+// init method gets (1, 0.0625).
+template <>
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+{
+    const bool is_ui_or_ni = (init_method == "ui" || init_method == "ni");
+
+    const unsigned max_rounding_point_distance = is_ui_or_ni ? 0u : 1u;
+    const double absolute_tol                  = is_ui_or_ni ? 2e-3 : 0.0625;
+    return ck_tile::make_tuple(max_rounding_point_distance, absolute_tol);
+}
+
+// "1,2,3,4" -> vector{1,2,3,4}
+// Splits q_val on ',' and converts every field with std::atoi, so an empty or
+// non-numeric field becomes 0. An empty input therefore yields {0}, and a
+// trailing comma adds a final 0 element.
+std::vector<ck_tile::index_t> decode_vec(std::string q_val)
+{
+    // Local lambda instead of a function-scoped macro: no reserved identifier
+    // and nothing to #undef on every exit path.
+    const auto to_index = [](const std::string& field) {
+        return static_cast<ck_tile::index_t>(std::atoi(field.c_str()));
+    };
+
+    std::vector<ck_tile::index_t> values;
+    std::string::size_type field_begin = 0;
+    for(;;)
+    {
+        const auto comma = q_val.find(',', field_begin);
+        // The last field runs to the end of the string.
+        const auto field_len =
+            (comma == std::string::npos) ? std::string::npos : comma - field_begin;
+        values.push_back(to_index(q_val.substr(field_begin, field_len)));
+        if(comma == std::string::npos)
+        {
+            break;
+        }
+        field_begin = comma + 1;
+    }
+    return values;
+}
+
+template <typename DataType>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    std::string data_type = arg_parser.get_str("prec");
+    int do_validation     = arg_parser.get_int("v");
+
+    auto shape        = decode_vec(arg_parser.get_str("shape"));
+    auto perm         = decode_vec(arg_parser.get_str("perm"));
+    int stream_warmup = arg_parser.get_int("warmup");
+    int stream_repeat = arg_parser.get_int("repeat");
+    bool kname        = arg_parser.get_bool("kname");
+    int seed          = arg_parser.get_int("seed");
+
+    assert(shape.size() == perm.size());
+    ck_tile::index_t rank = perm.size();
+    if(rank > ck_tile::GenericPermuteHostArgs::kMaxRanks)
+    {
+        printf("rank %d permute is not support yet\n", rank);
+        return false;
+    }
+
+    ck_tile::HostTensor<DataType> x(shape);
+    ck_tile::FillUniformDistributionIntegerValue<DataType>{-15, 15, seed}(x);
+
+    std::vector<ck_tile::index_t> y_shape = [&]() {
+        std::vector<ck_tile::index_t> tmp(rank, 0);
+        // std::cout << "@@@@" << tmp << std::endl;
+        for(int i = 0; i < static_cast<int>(rank); i++)
+        {
+            // std::cout << "  i:" << i << ", perm:" << perm[i] << ", rak:" <<
+            // static_cast<int>(rank)
+            // << std::endl;
+            tmp[i] = shape[perm[i]];
+        }
+        // std::cout << "@@@" << tmp << std::endl;
+        return tmp;
+    }();
+
+    ck_tile::HostTensor<DataType> y(y_shape);
+
+    ck_tile::DeviceMem x_buf(x.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x.data());
+
+    std::cout << "[" << data_type << "] shape:" << shape << "->" << y_shape << ", permute:" << perm
+              << std::flush;
+
+    ck_tile::stream_config stream_config{nullptr,
+                                         true,
+                                         /* log_level = */ (kname ? 1 : 0),
+                                         stream_warmup,
+                                         stream_repeat};
+    float ave_time   = 0.f;
+    auto run_permute = [&]() {
+        permute_traits t;
+        t.data_type = data_type;
+
+        permute_args a;
+        a.p_src = x_buf.GetDeviceBuffer();
+        a.p_dst = y_buf.GetDeviceBuffer();
+        a.rank  = rank;
+        std::copy(shape.begin(), shape.end(), a.shape);
+        std::copy(perm.begin(), perm.end(), a.perm);
+
+        return permute(t, a, stream_config);
+    };
+#if !CK_TILE_USE_WMMA
+#ifdef PERMUTE_USE_ALTERNATIVE_IMPL
+    // batch* n0*n1*n2*k0*k1*k2 -> batch* n0*k0*n1*k1*n2*k2
+    if((arg_parser.get_str("perm") == std::string("0,1,4,2,5,3,6") ||
+        arg_parser.get_str("perm") == std::string("0,1,2,4,5,3,6") ||
+        arg_parser.get_str("perm") == std::string("0,1,3,4,2,5")))
+    {
+        if(arg_parser.get_str("perm") == std::string("0,1,3,4,2,5"))
+        {
+            // b_nr_kr_kw_nw_kv = 2,   // 0,1,3,4,2,5
+            matrix_core_swizzle_traits t;
+            t.data_type = data_type;
+            t.permute   = arg_parser.get_str("perm");
+
+            matrix_core_swizzle_args a;
+            a.p_src = x_buf.GetDeviceBuffer();
+            a.p_dst = y_buf.GetDeviceBuffer();
+            a.batch = shape[0];
+
+            auto nr = shape[1];
+            auto nw = shape[2];
+            auto kr = shape[3];
+            auto kw = shape[4];
+            auto kv = shape[5];
+            a.n     = nr * nw;
+            a.k     = kr * kw * kv;
+            if(kv == 8 && kw == 4 && nw == 16 && nr % 4 == 0 && kr % 8 == 0)
+            {
+                t.inst = "16x16x16";
+                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else if(kv == 8 && kw == 2 && nw == 32 && nr % 4 == 0 && kr % 8 == 0)
+            {
+                t.inst = "32x32x8";
+                std::cout << ", matrix_core_swizzle_waveflatten_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else
+            {
+                ave_time = run_permute();
+            }
+        }
+        else
+        {
+            matrix_core_swizzle_traits t;
+            t.data_type = data_type;
+            t.permute   = arg_parser.get_str("perm");
+
+            matrix_core_swizzle_args a;
+            a.p_src = x_buf.GetDeviceBuffer();
+            a.p_dst = y_buf.GetDeviceBuffer();
+            a.batch = shape[0];
+            a.n     = shape[1] * shape[2] * shape[3];
+            a.k     = shape[4] * shape[5] * shape[6];
+            if(shape[6] == 8 && shape[3] == 32 && shape[5] == 2 && shape[2] == 4 &&
+               shape[4] % 8 == 0 && shape[1] % 2 == 0)
+            {
+                // 32x32x8 inst
+                // perm=0,1,4,2,5,3,6
+                // y_shape=*,2x,8x,4,2,32,8 (3,6,16,4,2,32,8)
+                // shape = *,2x,4,32,8x,2,8 (3,6,4,32,16,2,8)
+
+                t.inst = "32x32x8";
+                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else if(shape[6] == 8 && shape[3] == 16 && shape[5] == 4 && shape[2] == 4 &&
+                    shape[4] % 4 == 0 && shape[1] % 4 == 0)
+            {
+                // 16x16x16 inst
+                // perm=0,1,4,2,5,3,6
+                // y_shape=*,4x,4x,4,4,16,8
+                // shape = *,4x,4,16,4x,4,8 (3,8,4,16,16,4,8)
+                t.inst = "16x16x16";
+                std::cout << ", matrix_core_swizzle_" << t.inst << std::flush;
+
+                ave_time = matrix_core_swizzle(t, a, stream_config);
+            }
+            else
+            {
+                ave_time = run_permute();
+            }
+        }
+    }
+    else
+#endif
+#endif
+    {
+        ave_time = run_permute();
+    }
+    std::cout << ", time:" << ave_time << "ms" << std::flush;
+
+    bool pass = true;
+    if(do_validation)
+    {
+        reference_permute(x, y, perm);
+#if 0
+        if constexpr (std::is_same_v<float, DataType>){
+            // using itype = to_integer_type<sizeof(DataType)>;
+            fflush(stdout);
+            for(int zz = 0; zz < static_cast<int>(x.get_element_size()); zz++   ) {
+                printf("%3.0f ", x.mData[zz]);
+            }
+            printf("->\n");
+            for(int zz = 0; zz < static_cast<int>(x.get_element_size()); zz++   ) {
+                printf("%3.0f ", y.mData[zz]);
+            }
+            fflush(stdout);
+        }
+#endif
+        ck_tile::HostTensor<DataType> y_dev(y.get_lengths());
+
+        y_buf.FromDevice(y_dev.data());
+
+        pass = std::equal(
+            y_dev.begin(), y_dev.end(), y.begin(), [&](const DataType& d, const DataType& h) {
+                using itype = to_integer_type<sizeof(DataType)>;
+                itype i_d   = ck_tile::bit_cast<itype>(d);
+                itype i_h   = ck_tile::bit_cast<itype>(h);
+                return i_d == i_h;
+            });
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush;
+    }
+
+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_permute_json_results(arg_parser.get_str("jsonfile"), data_type, pass, ave_time, 0, 0);
+    }
+
+    std::cout << std::endl;
+
+    return pass;
+}
+
+// Entry point of the permute example.
+// Exit codes: 0 = run() succeeded, -1 = argument parsing failed, -2 = run() reported failure,
+// -3 = the "prec" option names a precision this example does not handle.
+int main(int argc, char* argv[])
+{
+    auto [parsed_ok, parser] = create_args(argc, argv);
+    if(!parsed_ok)
+    {
+        return -1;
+    }
+
+    // Maps the boolean outcome of run() onto the process exit code.
+    const auto to_exit_code = [](bool passed) { return passed ? 0 : -2; };
+
+    // Each supported precision string instantiates run<> with the matching element type.
+    const std::string precision = parser.get_str("prec");
+    if(precision == "fp8")
+        return to_exit_code(run<ck_tile::fp8_t>(parser));
+    if(precision == "fp16")
+        return to_exit_code(run<ck_tile::half_t>(parser));
+    if(precision == "fp32")
+        return to_exit_code(run<float>(parser));
+
+    return -3;
+}
--- a/example/ck_tile/06_permute/permute.hpp
+++ b/example/ck_tile/06_permute/permute.hpp
@@ -0,0 +1,19 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/permute.hpp"
+#include <string>
+
+// Traits handed to permute(). The example driver fills data_type from its "prec" option
+// (it accepts "fp8", "fp16" and "fp32").
+struct permute_traits
+{
+    std::string data_type; // precision name as a string
+};
+
+// Argument bundle for permute(); the example driver sets p_src, p_dst, rank, shape and perm.
+using permute_args = ck_tile::GenericPermuteHostArgs;
+
+// host API
+// Takes the traits, the arguments and a stream configuration. The example driver stores the
+// returned float as its average time and prints it with an "ms" suffix.
+float permute(permute_traits, permute_args, const ck_tile::stream_config&);
--- a/example/ck_tile/06_permute/script/smoke_test.sh
+++ b/example/ck_tile/06_permute/script/smoke_test.sh
@@ -0,0 +1,37 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Smoke test for the permute example: runs the example binary over a fixed list of
+# shape/permutation combinations with validation enabled.
+# Passing any argument to this script turns on shell tracing (set -x).
+
+# TODO: run this script from CK root
+# The executable path below is relative, so the current directory must contain $BUILD.
+BUILD=build
+EXE=$BUILD/bin/tile_example_permute
+# -v=1 enables validation; no warmup iterations and a single timed repeat keep the run short.
+COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
+# mode=0
+# export HIP_VISIBLE_DEVICES=4
+if [ $# -ge 1 ] ; then
+    set -x
+fi
+
+# 7-D and 6-D cases whose -perm strings are the ones the example special-cases
+# (matrix_core_swizzle path when built with PERMUTE_USE_ALTERNATIVE_IMPL).
+$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,4,2,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,6,4,32,16,2,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=5,10,4,32,8,2,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=3,8,4,16,16,4,8 -perm=0,1,2,4,5,3,6  $COMMON_ARGS
+$EXE -prec=fp16 -shape=2,8,16,8,4,8 -perm=0,1,3,4,2,5  $COMMON_ARGS
+$EXE -prec=fp16 -shape=1,24,32,16,2,8 -perm=0,1,3,4,2,5  $COMMON_ARGS
+
+echo "------------------------------------------------------------------"
+
+# Generic permutations of rank 2 to 8, repeated for every supported precision.
+for prec in "fp8" "fp16" "fp32" ; do
+
+$EXE -prec=$prec -shape=3,8 -perm=1,0 $COMMON_ARGS
+$EXE -prec=$prec -shape=48,6,8 -perm=2,1,0  $COMMON_ARGS
+$EXE -prec=$prec -shape=24,128,3 -perm=0,2,1  $COMMON_ARGS
+$EXE -prec=$prec -shape=4,10,7,6 -perm=0,2,3,1  $COMMON_ARGS
+$EXE -prec=$prec -shape=8,24,36,10 -perm=3,1,2,0  $COMMON_ARGS
+$EXE -prec=$prec -shape=8,1,36,4 -perm=2,1,0,3  $COMMON_ARGS
+$EXE -prec=$prec -shape=5,10,16,2,36,4 -perm=4,5,2,1,0,3  $COMMON_ARGS
+$EXE -prec=$prec -shape=2,32,8,3,6,2,5,4 -perm=5,2,4,7,1,6,3,0  $COMMON_ARGS
+echo "------------------------------------------------------------------"
+done
--- a/example/ck_tile/09_topk_softmax/CMakeLists.txt
+++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Example executable: the command-line driver plus the host-side kernel dispatch.
+add_executable(tile_example_topk_softmax
+  topk_softmax.cpp
+  topk_softmax_api.cpp
+)
+target_include_directories(tile_example_topk_softmax
+  PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/
+)
+
+# Warning suppressions applied to this target only.
+# NOTE: undefined-func-template is turned off so the sources compile without explicitly
+# declaring function specializations.
+set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS
+  -Wno-undefined-func-template
+  -Wno-float-equal
+)
+# Debugging aid, normally disabled: verbose compile and keep intermediate files.
+# list(APPEND EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+target_compile_options(tile_example_topk_softmax
+  PRIVATE
+    ${EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS}
+)
--- a/example/ck_tile/09_topk_softmax/README.md
+++ b/example/ck_tile/09_topk_softmax/README.md
@@ -0,0 +1,74 @@
+# TopK-Softmax with CK Tile
+
+This example demonstrates a tile-programming implementation of TopK-Softmax, commonly used in Mixture-of-Experts (MoE) models to select the top-k experts per token after softmax; the kernel typically runs before launching the fused-moe-gemm block. The input is a `token*expert` 2D matrix. The op applies a softmax to each row (across `expert`), then finds the `topk` largest values in each row. The output consists of two `token*topk` 2D tensors: the weights (typically fp32) and the indices (int32).
+
+---
+
+## Algorithm and Math
+
+Given a matrix $X$ of shape $[\text{tokens}, \text{experts}]$:
+1. **Softmax per row**: $S_{i,j} = \frac{\exp(X_{i,j})}{\sum_k \exp(X_{i,k})}$
+2. **TopK selection**: For each row $i$, select the $k$ largest $S_{i,j}$ and their indices.
+
+**Output**:  
+- $[\text{tokens}, k]$ weights (fp32)
+- $[\text{tokens}, k]$ indices (int32)
+
+---
+
+## Tile Programming Model
+
+- **Tiles**: Each thread block processes a tile (block of rows).
+- **Pipeline**: Modular, can be extended for fused operations.
+
+---
+
+## Build & Run
+
+```bash
+# in the root of ck_tile
+mkdir build && cd build
+../script/cmake-ck-dev.sh  ../ <arch>  # replace <arch> with your target, e.g. gfx90a, gfx942...
+make tile_example_topk_softmax -j
+```
+This will result in an executable `build/bin/tile_example_topk_softmax`
+
+### Arguments
+
+```bash
+args:
+          -v    whether to do CPU validation or not (default:1)
+       -pr_i    input data type. fp16/bf16 (default:fp16)
+       -pr_w    output weight data type(currently only fp32 supported now) (default:fp32)
+          -t    number of input tokens (default:32)
+          -e    number of experts (default:8)
+          -k    topk (default:2)
+       -st_i    row stride of input, -1 means same as experts (default:-1)
+       -st_o    row stride of output/indices, -1 means same as topk (default:-1)
+       -seed    seed to be used, -1 means random every time (default:-1)
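+      -kname    when set to 1 it will print kernel name (default:0)
+     -warmup    number of iterations before benchmark the kernel (default:5)
+     -repeat    number of iterations to benchmark the kernel (default:20)
+       -json    0: No Json, 1: Dump Results in Json format (default:0)
+   -jsonfile    json file name to dump results (default:topk_softmax.json)
+ -activation    activation function to use: softmax or sigmoid (default:softmax)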
+
+```
+
+---
+
+## Source Structure
+
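+- **Host API**: [`topk_softmax_api.hpp`](topk_softmax_api.hpp) (trait/argument structs and the `topk_softmax()` declaration) and [`topk_softmax_api.cpp`](topk_softmax_api.cpp) (dispatch to the tile-programming kernel instances)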
+- **Executable**: [`topk_softmax.cpp`](topk_softmax.cpp) (argument parsing, kernel launch)
+- **Build**: `CMakeLists.txt`, `script/`
+
+---
+
+## Related CK Tile Examples
+
+- [15_fused_moe](../15_fused_moe/README.md): Fused MoE block using TopK-Softmax
+- [05_reduce](../05_reduce/README.md): Reductions with tiles
+- [03_gemm](../03_gemm/README.md): GEMM with tiles
+
+For distribution, see [`include/ck_tile/tile_program/tile_distribution/`](../../../include/ck_tile/tile_program/tile_distribution/).
+
+---
+[Back to CK Tile Examples](../README.md)
--- a/example/ck_tile/09_topk_softmax/script/smoke_test.sh
+++ b/example/ck_tile/09_topk_softmax/script/smoke_test.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# Smoke test for the topk_softmax example: runs the example binary over a fixed list of
+# token/expert/topk/stride combinations for each input precision.
+# Options not given on a line (-k, -st_i, -st_o, -activation, ...) use the binary's defaults.
+
+# Relative path: the current directory must contain ./build.
+EXE=./build/bin/tile_example_topk_softmax
+
+for pr_i in "fp16" "bf16" ; do
+$EXE -pr_i=$pr_i -t=80 -e=17
+$EXE -pr_i=$pr_i -t=111 -e=117
+$EXE -pr_i=$pr_i -t=1000 -e=55
+$EXE -pr_i=$pr_i -t=99 -e=180
+$EXE -pr_i=$pr_i -t=175 -e=64 -k=8
+$EXE -pr_i=$pr_i -t=65 -e=8 -k=2
+$EXE -pr_i=$pr_i -t=1 -e=25
+$EXE -pr_i=$pr_i -t=31 -e=19 -k=15
+$EXE -pr_i=$pr_i -t=81 -e=37 -k=7
+$EXE -pr_i=$pr_i -t=199 -e=128 -k=13
+$EXE -pr_i=$pr_i -t=23 -e=1 -k=1
+# cases with explicit row strides larger than the row length
+$EXE -pr_i=$pr_i -t=127 -e=99 -k=19 -st_i=233 -st_o=31
+$EXE -pr_i=$pr_i -t=71 -e=11 -k=11 -st_i=30 -st_o=12
+$EXE -pr_i=$pr_i -t=1 -e=1 -k=1
+$EXE -pr_i=$pr_i -t=99 -e=2 -k=1 -st_i=11 -st_o=5
+$EXE -pr_i=$pr_i -t=333 -e=99 -k=13 -st_i=191 -st_o=17
+done
--- a/example/ck_tile/09_topk_softmax/topk_softmax.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax.cpp
@@ -0,0 +1,354 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+#include "topk_softmax_api.hpp"
+#include "ck_tile/utility/json_dump.hpp"
+
+// Debug helper, compiled out by the "#if 0" guard: prints a 2D host tensor to std::cout,
+// one bracketed row per line, followed by a separator line.
+#if 0
+template <typename T>
+void dump_host_tensor_2d(const ck_tile::HostTensor<T>& x)
+{
+    auto len = x.get_lengths();
+    assert(len.size() == 2);
+    std::cout << "[";
+    for(size_t i = 0; i < len[0]; i++)
+    {
+        std::cout << i << ": [";
+        for(size_t j = 0; j < len[1]; j++)
+        {
+            // fp16 elements are converted to float before printing and comma-separated;
+            // every other element type is streamed directly, followed by a space.
+            if constexpr(std::is_same_v<T, ck_tile::fp16_t>)
+            {
+                auto v = ck_tile::type_convert<float>(x(i, j));
+
+                std::cout << v;
+                if(j != len[1] - 1)
+                    std::cout << ",";
+            }
+            else
+            {
+                std::cout << x(i, j) << " ";
+            }
+        }
+        std::cout << "]";
+        // rows are separated by commas; the last row closes the outer bracket instead
+        if(i != len[0] - 1)
+            std::cout << ",";
+        else
+            std::cout << "]";
+        std::cout << std::endl;
+    }
+    std::cout << "--------------------" << std::endl;
+}
+#endif
+
+// CPU reference: softmax along `dim` followed by top-k selection.
+// Returns the selected values and their indices packed in a ck_tile tuple.
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+auto reference_topk_softmax(const ck_tile::HostTensor<InputType>& x,
+                            ck_tile::index_t k,
+                            ck_tile::index_t dim = -1,
+                            bool largest         = true,
+                            bool sorted          = true)
+{
+    using namespace ck_tile;
+
+    // step 1: softmax, producing WeightType probabilities
+    auto probabilities = reference_softmax<InputType, WeightType, WeightType>(x, dim);
+
+    // step 2: top-k over the probabilities
+    auto [topk_values, topk_indices] = reference_topk(probabilities, k, dim, largest, sorted);
+
+    return ck_tile::make_tuple(topk_values, topk_indices);
+}
+
+// CPU reference: softmax along `dim` followed by top-k selection, writing the selected
+// values and indices into the caller-provided tensors `y_values` and `y_indices`.
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+auto reference_topk_softmax(const ck_tile::HostTensor<InputType>& x,
+                            ck_tile::HostTensor<WeightType>& y_values,
+                            ck_tile::HostTensor<IndexType>& y_indices,
+                            ck_tile::index_t k,
+                            ck_tile::index_t dim = -1,
+                            bool largest         = true,
+                            bool sorted          = true)
+{
+    using namespace ck_tile;
+
+    // softmax first (WeightType probabilities), then top-k straight into the outputs
+    auto probabilities = reference_softmax<InputType, WeightType, WeightType>(x, dim);
+    reference_topk(probabilities, y_values, y_indices, k, dim, largest, sorted);
+}
+
+// CPU reference for the sigmoid activation: top-k selection on the raw inputs (converted to
+// float), then sigmoid applied in place to the selected values only.
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+auto reference_topk_sigmoid(const ck_tile::HostTensor<InputType>& x,
+                            ck_tile::HostTensor<WeightType>& y_values,
+                            ck_tile::HostTensor<IndexType>& y_indices,
+                            ck_tile::index_t k,
+                            ck_tile::index_t dim = -1,
+                            bool largest         = true,
+                            bool sorted          = true)
+{
+    using namespace ck_tile;
+
+    // The selection runs before the activation: there is no need to apply the sigmoid to
+    // every element first.
+    auto input_as_float = x.template CopyAsType<float>();
+    reference_topk(input_as_float, y_values, y_indices, k, dim, largest, sorted);
+
+    // sigmoid(v) = 1 / (1 + exp(-v)), evaluated on the k selected values
+    const auto sigmoid = [](auto v) { return WeightType(1) / (WeightType(1) + exp(-v)); };
+    std::transform(y_values.begin(), y_values.end(), y_values.begin(), sigmoid);
+}
+
+// Validation thresholds, chosen per data type and returned as a two-element ck_tile tuple.
+// Generic case: (rtol, atol) = (1e-3, 1e-3); the init method is ignored.
+template <typename DataType>
+auto get_elimit(std::string /*init_method*/)
+{
+    double relative_tolerance{1e-3};
+    double absolute_tolerance{1e-3};
+    return ck_tile::make_tuple(relative_tolerance, absolute_tolerance);
+}
+
+// bf16: looser thresholds, (rtol, atol) = (1e-2, 1e-2); the init method is ignored.
+template <>
+auto get_elimit<ck_tile::bf16_t>(std::string /*init_method*/)
+{
+    double relative_tolerance{1e-2};
+    double absolute_tolerance{1e-2};
+    return ck_tile::make_tuple(relative_tolerance, absolute_tolerance);
+}
+
+// fp8: returns (max_rounding_point_distance, atol) instead of (rtol, atol).
+// The "ui" and "ni" init methods get (0, 2e-3); every other init method gets (1, 0.0625).
+template <>
+auto get_elimit<ck_tile::fp8_t>(std::string init_method)
+{
+    const bool strict_init = (init_method == "ui" || init_method == "ni");
+
+    unsigned max_rounding_point_distance = strict_init ? 0 : 1;
+    double atol                          = strict_init ? 2e-3 : 0.0625;
+    return ck_tile::make_tuple(max_rounding_point_distance, atol);
+}
+
+// Registers every command-line option of the example (name, default value, help text) and
+// parses argv. Returns a std::tuple of (parse succeeded, the populated parser).
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    // Help texts fixed: "weather" -> "whether", and pr_i now lists the input types that
+    // main() actually dispatches (fp16/bf16).
+    arg_parser.insert("v", "1", "whether to do CPU validation or not")
+        .insert("pr_i", "fp16", "input data type. fp16/bf16")
+        .insert("pr_w", "fp32", "output weight data type(currently only fp32 supported now)")
+        .insert("t", "32", "number of input tokens")
+        .insert("e", "8", "number of experts")
+        .insert("k", "2", "topk")
+        .insert("st_i", "-1", "row stride of input, -1 means same as experts")
+        .insert("st_o", "-1", "row stride of output/indices, -1 means same as topk")
+        .insert("seed", "-1", "seed to be used, -1 means random every time")
+        .insert("kname", "0", "when set to 1 it will print kernel name")
+        .insert("warmup", "5", "number of iterations before benchmark the kernel")
+        .insert("repeat", "20", "number of iterations to benchmark the kernel")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "topk_softmax.json", "json file name to dump results")
+        .insert("activation", "softmax", "activation function to use: softmax or sigmoid");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+// Runs one top-k case described by the command-line arguments:
+//   1. fills a tokens x experts input with per-row unique random values,
+//   2. launches topk_softmax() on the device and prints the measured time,
+//   3. if "v" is non-zero, compares values and indices row by row against the CPU reference
+//      for the selected activation ("softmax" or "sigmoid"),
+//   4. if "json" is 1, dumps the result to the file named by "jsonfile".
+// Returns false when the arguments are inconsistent, when the kernel reports "not supported"
+// (negative time) or when validation fails; true otherwise.
+// Throws std::runtime_error when validation is requested for an unknown activation.
+template <typename InputType, typename WeightType, typename IndexType = ck_tile::index_t>
+bool test_topk_softmax(ck_tile::ArgParser args)
+{
+    int validate            = args.get_int("v");
+    std::string input_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+    int tokens              = args.get_int("t");
+    int experts             = args.get_int("e");
+    int topk                = args.get_int("k");
+    int seed                = args.get_int("seed");
+    int stride_input        = args.get_int("st_i");
+    int stride_output       = args.get_int("st_o");
+    int kname               = args.get_int("kname");
+    int warmup              = args.get_int("warmup");
+    int repeat              = args.get_int("repeat");
+    std::string activation  = args.get_str("activation");
+
+    // a negative stride means "densely packed": use the row length
+    if(stride_input < 0)
+    {
+        stride_input = experts;
+    }
+    if(stride_output < 0)
+    {
+        stride_output = topk;
+    }
+
+    // The strides come straight from the command line, so check them at run time; an
+    // assert() would be compiled out when NDEBUG is defined and let a too-small stride through.
+    if(stride_input < experts || stride_output < topk)
+    {
+        printf("st_i:%d should be no smaller than experts:%d, and st_o:%d no smaller than "
+               "topk:%d\n",
+               stride_input,
+               experts,
+               stride_output,
+               topk);
+        return false;
+    }
+
+    // a negative seed means "different every run": derive it from the current time
+    if(seed < 0)
+    {
+        seed = std::time(nullptr);
+    }
+
+    if(topk > experts)
+    {
+        printf("topk:%d value should be smaller than, or equal to number of experts:%d\n",
+               topk,
+               experts);
+        return false;
+    }
+
+    // tokens already considered batch size
+    ck_tile::HostTensor<InputType> x_host({tokens, experts}, {stride_input, 1});
+    ck_tile::HostTensor<WeightType> value_host({tokens, topk}, {stride_output, 1});
+    ck_tile::HostTensor<IndexType> index_host({tokens, topk}, {stride_output, 1});
+
+    {
+        // random require per-row unique
+        auto rand_gen = ck_tile::FillUniformDistribution_Unique<InputType>{
+            -5.f, 5.f, static_cast<uint32_t>(seed)};
+
+        // generate each row into a dense temporary, then copy it to the row's strided offset
+        for(int i_t = 0; i_t < tokens; i_t++)
+        {
+            ck_tile::HostTensor<InputType> x_row({experts});
+            rand_gen(x_row);
+            std::copy(x_row.begin(), x_row.end(), x_host.begin() + i_t * stride_input);
+            rand_gen.clear();
+        }
+    }
+
+    ck_tile::DeviceMem x_dev(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem value_dev(value_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem index_dev(index_host.get_element_space_size_in_bytes());
+
+    x_dev.ToDevice(x_host.data());
+
+    topk_softmax_trait trait{input_prec, weight_prec, experts, activation};
+
+    topk_softmax_kargs karg{x_dev.GetDeviceBuffer(),
+                            value_dev.GetDeviceBuffer(),
+                            index_dev.GetDeviceBuffer(),
+                            tokens,
+                            experts,
+                            topk,
+                            stride_input,
+                            stride_output};
+
+    ck_tile::stream_config sc{nullptr,
+                              true,
+                              /* log_level = */ (kname ? 1 : 0),
+                              warmup,
+                              repeat};
+    auto ms = topk_softmax(trait, karg, sc);
+    printf("[%s|%s]tokens:%d, experts:%d, topk:%d, st_i:%d, st_o:%d, activation:%s, ms:%f, ",
+           input_prec.c_str(),
+           weight_prec.c_str(),
+           tokens,
+           experts,
+           topk,
+           stride_input,
+           stride_output,
+           activation.c_str(),
+           ms);
+    // topk_softmax() returns a negative value when no kernel instance matches the request
+    if(ms < 0)
+        printf("not supported\n");
+    fflush(stdout);
+    if(ms < 0)
+    {
+        return false;
+    }
+
+    value_dev.FromDevice(value_host.data());
+    index_dev.FromDevice(index_host.data());
+
+    bool rtn = true;
+    if(validate)
+    {
+        ck_tile::HostTensor<WeightType> value_ref({tokens, topk}, {stride_output, 1});
+        ck_tile::HostTensor<IndexType> index_ref({tokens, topk}, {stride_output, 1});
+
+        if(activation == "softmax")
+        {
+            reference_topk_softmax<InputType, WeightType, IndexType>(
+                x_host, value_ref, index_ref, topk);
+        }
+        else if(activation == "sigmoid")
+        {
+            reference_topk_sigmoid<InputType, WeightType, IndexType>(
+                x_host, value_ref, index_ref, topk);
+        }
+        else
+        {
+            throw std::runtime_error("unsupported activation type: " + activation);
+        }
+
+        auto [rtol, atol] = get_elimit<InputType>("");
+        // compare one row at a time, restricted to the first topk columns of the row
+        for(int i_t = 0; i_t < tokens; i_t++)
+        {
+            auto s_begin = std::vector<size_t>{static_cast<size_t>(i_t), static_cast<size_t>(0)};
+            auto s_end =
+                std::vector<size_t>{static_cast<size_t>(i_t + 1), static_cast<size_t>(topk)};
+            auto s_value_host = value_host.slice(s_begin, s_end);
+            auto s_value_ref  = value_ref.slice(s_begin, s_end);
+            rtn &= ck_tile::check_err(s_value_host,
+                                      s_value_ref,
+                                      std::string("[") + std::to_string(i_t) +
+                                          std::string("] Value Error:"),
+                                      rtol,
+                                      atol);
+            auto s_index_host = index_host.slice(s_begin, s_end);
+            auto s_index_ref  = index_ref.slice(s_begin, s_end);
+            rtn &= ck_tile::check_err(s_index_host,
+                                      s_index_ref,
+                                      std::string("[") + std::to_string(i_t) +
+                                          std::string("] Index Error:"),
+                                      rtol,
+                                      atol);
+        }
+    }
+
+    printf("valid:%s\n", rtn ? "y" : "n");
+
+    if(args.get_int("json") == 1)
+    {
+        dump_topk_softmax_json(args.get_str("jsonfile"),
+                               input_prec,
+                               weight_prec,
+                               tokens,
+                               experts,
+                               topk,
+                               stride_input,
+                               stride_output,
+                               ms,
+                               0,
+                               0,
+                               rtn);
+    }
+
+    fflush(stdout);
+    return rtn;
+}
+
+// Entry point of the topk_softmax example.
+// Exit codes: 0 when the selected test passes; -1 when argument parsing fails, when the test
+// fails, or when the requested pr_i/pr_w combination is not one this example instantiates.
+int main(int argc, char** argv)
+{
+    auto [result, args] = create_args(argc, argv);
+    if(!result)
+        return -1;
+    std::string input_prec  = args.get_str("pr_i");
+    std::string weight_prec = args.get_str("pr_w");
+
+    // Only fp32 weights are instantiated, combined with either fp16 or bf16 input.
+    if(weight_prec == "fp32")
+    {
+        if(input_prec == "fp16")
+        {
+            return test_topk_softmax<ck_tile::fp16_t, float, ck_tile::index_t>(args) ? 0 : -1;
+        }
+        if(input_prec == "bf16")
+        {
+            return test_topk_softmax<ck_tile::bf16_t, float, ck_tile::index_t>(args) ? 0 : -1;
+        }
+    }
+
+    // Nothing was run: report it as a failure instead of exiting with 0.
+    printf("unsupported precision combination pr_i:%s, pr_w:%s\n",
+           input_prec.c_str(),
+           weight_prec.c_str());
+    return -1;
+}
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.cpp
@@ -0,0 +1,169 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "topk_softmax_api.hpp"
+
+// Instantiates and launches one kernel variant, then returns from the enclosing function.
+//   experts_     : compile-time expert count forwarded to TopkSoftmaxWarpPerRowProblem
+//   use_softmax_ : compile-time flag forwarded to the same problem type
+// The expansion site must provide the type aliases ts_input_type, ts_weight_type and
+// ts_index_type, plus the variables `a` (host arguments) and `s` (stream config).
+// The macro declares local names and ends with `return ave_time;`, so each use needs its own
+// brace-enclosed scope.
+#define TOPK_SOFTMAX_DISPATCH(experts_, use_softmax_)                                              \
+    constexpr ck_tile::index_t ts_experts = experts_;                                              \
+    constexpr bool ts_use_softmax         = use_softmax_;                                          \
+    using ts_problem                      = ck_tile::TopkSoftmaxWarpPerRowProblem<ts_input_type,   \
+                                                                                  ts_weight_type,  \
+                                                                                  ts_index_type,   \
+                                                                                  ts_experts,      \
+                                                                                  ts_use_softmax>; \
+    using ts_pipeline                     = ck_tile::TopkSoftmaxWarpPerRowPipeline<ts_problem>;    \
+                                                                                                   \
+    using kernel = ck_tile::TopkSoftmaxKernel<ts_pipeline>;                                        \
+                                                                                                   \
+    auto kargs = kernel::MakeKargs(a);                                                             \
+                                                                                                   \
+    const dim3 grids  = kernel::GridSize(a);                                                       \
+    const dim3 blocks = kernel::BlockSize();                                                       \
+                                                                                                   \
+    float ave_time =                                                                               \
+        ck_tile::launch_kernel(s, ck_tile::make_kernel<1>(kernel{}, grids, blocks, 0, kargs));     \
+                                                                                                   \
+    return ave_time;
+
+// Private helper: selects the kernel instance by expert count for one
+// (input type, activation) combination.
+// Uses the smallest instantiated expert bucket (8/16/32/64/128/192) that is >= t.experts,
+// launches it through TOPK_SOFTMAX_DISPATCH and returns the value of ck_tile::launch_kernel.
+// Returns -1 when t.experts exceeds the largest bucket.
+template <typename ts_input_type, bool kUseSoftmax>
+static float topk_softmax_dispatch_experts(const topk_softmax_trait& t,
+                                           topk_softmax_kargs a,
+                                           const ck_tile::stream_config& s)
+{
+    // These aliases, together with ts_input_type, `a` and `s`, are the names that
+    // TOPK_SOFTMAX_DISPATCH expects to find in scope.
+    using ts_weight_type = float;
+    using ts_index_type  = ck_tile::index_t;
+
+    if(t.experts <= 8)
+    {
+        TOPK_SOFTMAX_DISPATCH(8, kUseSoftmax)
+    }
+    else if(t.experts <= 16)
+    {
+        TOPK_SOFTMAX_DISPATCH(16, kUseSoftmax)
+    }
+    else if(t.experts <= 32)
+    {
+        TOPK_SOFTMAX_DISPATCH(32, kUseSoftmax)
+    }
+    else if(t.experts <= 64)
+    {
+        TOPK_SOFTMAX_DISPATCH(64, kUseSoftmax)
+    }
+    else if(t.experts <= 128)
+    {
+        TOPK_SOFTMAX_DISPATCH(128, kUseSoftmax)
+    }
+    else if(t.experts <= 192)
+    {
+        TOPK_SOFTMAX_DISPATCH(192, kUseSoftmax)
+    }
+    return -1;
+}
+
+// Host API: maps the run-time traits onto a compiled kernel instance and launches it.
+//   t : input type ("fp16"/"bf16"), weight type ("fp32"), expert count and activation
+//       ("softmax"/"sigmoid")
+//   a : host arguments forwarded to the kernel
+//   s : stream configuration forwarded to ck_tile::launch_kernel
+// Returns the value reported by ck_tile::launch_kernel, or -1 when no instance matches the
+// traits (unknown type/activation string, or more than 192 experts).
+float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s)
+{
+    // every instance uses fp32 weights
+    if(t.weight_type != "fp32")
+    {
+        return -1;
+    }
+
+    const bool is_softmax = (t.activation == "softmax");
+    const bool is_sigmoid = (t.activation == "sigmoid");
+
+    if(t.input_type == "fp16")
+    {
+        if(is_softmax)
+            return topk_softmax_dispatch_experts<ck_tile::fp16_t, true>(t, a, s);
+        if(is_sigmoid)
+            return topk_softmax_dispatch_experts<ck_tile::fp16_t, false>(t, a, s);
+    }
+    else if(t.input_type == "bf16")
+    {
+        if(is_softmax)
+            return topk_softmax_dispatch_experts<ck_tile::bf16_t, true>(t, a, s);
+        if(is_sigmoid)
+            return topk_softmax_dispatch_experts<ck_tile::bf16_t, false>(t, a, s);
+    }
+    return -1;
+}
--- a/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
+++ b/example/ck_tile/09_topk_softmax/topk_softmax_api.hpp
@@ -0,0 +1,22 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+#include "ck_tile/ops/topk_softmax.hpp"
+#include <string>
+
+struct topk_softmax_trait // runtime selector describing which kernel instance to dispatch
+{
+    std::string input_type; // input element type name; the visible dispatch branches match "fp16" and "bf16"
+    std::string weight_type; // currently always float (the visible dispatch branches match the string "fp32")
+    int experts; // expert count; the visible dispatch branches bucket it by upper bounds from 8 up to 192
+    std::string activation; // "softmax" or "sigmoid"
+};
+
+struct topk_softmax_kargs : public ck_tile::TopkSoftmaxHostArgs // example-side alias type: adds no members of its own
+{
+};
+
+float topk_softmax(topk_softmax_trait t, topk_softmax_kargs a, ck_tile::stream_config s); // NOTE(review): the definition appears to return -1 when no trait combination matches -- confirm
--- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
+++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
@@ -0,0 +1,51 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# API names advertised to the user, and the user-selectable subset (cache
+# variable) that is passed to generate.py via --api.
+set(RMSNORM2D_FWD_KNOWN_APIS "fwd;bwd")
+set(RMSNORM2D_FWD_ENABLE_APIS "fwd" CACHE STRING
+  "semicolon-separated list of APIs to generate (${RMSNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
+
+# The special value "all" expands to every known API.
+if("${RMSNORM2D_FWD_ENABLE_APIS}" STREQUAL "all")
+  set(RMSNORM2D_FWD_ENABLE_APIS "${RMSNORM2D_FWD_KNOWN_APIS}")
+endif()
+
+# Generate the list of kernel sources at configure time; the files themselves
+# are not emitted here but at build time, by the `--gen_blobs` custom command.
+#
+# The list depends on the generator script, so register it as a configure
+# dependency: editing generate.py then re-runs CMake (and `--list_blobs`)
+# instead of silently reusing an outdated blob list.
+set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
+  ${CMAKE_CURRENT_LIST_DIR}/generate.py)
+
+execute_process(
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${RMSNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
+  RESULT_VARIABLE ret
+)
+# RESULT_VARIABLE holds the exit code, or an error string when the process
+# could not be run at all; both cases abort the configure step.
+if(ret AND NOT ret EQUAL 0)
+  message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+endif()
+
+# Each line of the blob list (presumably written by the `--list_blobs` run
+# above) becomes one entry of RMSNORM2D_FWD_GEN_BLOBS.
+file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/rmsnorm2d_fwd_blobs.txt RMSNORM2D_FWD_GEN_BLOBS)
+
+# Build-time rule that emits the generated kernel sources listed in
+# RMSNORM2D_FWD_GEN_BLOBS. DEPENDS makes the rule re-run when the generator
+# script changes, so edits to generate.py do not leave stale kernels behind.
+# VERBATIM keeps argument escaping identical across platforms/generators.
+add_custom_command(
+  OUTPUT ${RMSNORM2D_FWD_GEN_BLOBS}
+  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  --api ${RMSNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
+  DEPENDS ${CMAKE_CURRENT_LIST_DIR}/generate.py
+  VERBATIM
+)
+
+# Compile flags shared by both executables defined below.
+# NOTE: undefined-func-template is turned off so the sources compile without
+# explicitly declaring the function specializations.
+set(TILE_RMSNORM2D_FWD_COMPILE_OPTIONS
+  -Wno-undefined-func-template
+  -Wno-float-equal
+  --offload-compress
+)
+
+# Main executable: rmsnorm2d_fwd.cpp plus the generated kernel sources.
+set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")
+message(DEBUG "adding ${TILE_RMSNORM2D_FWD}")
+add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp)
+target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS})
+target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
+
+# Second executable, built from a single source file with the same flags.
+set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
+add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp)
+target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})
+
+# TODO: this global property has to be turned off; otherwise the progress messages
+# generated by CMake list too many files and the build fails with
+# "execvp: /bin/sh: Argument list too long". Note that the property is GLOBAL, so
+# it may affect targets outside this directory. TODO: consider generating a makefile ourselves.
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/Show More
+++ b/Show More