mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
add code
This commit is contained in:
41
example/ck_tile/01_fmha/CMakeLists.txt
Normal file
41
example/ck_tile/01_fmha/CMakeLists.txt
Normal file
@@ -0,0 +1,41 @@
|
||||
# generate a list of kernels, but not actually emit files at config stage
execute_process(
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --list_blobs ${CMAKE_CURRENT_BINARY_DIR}/blob_list.txt
  RESULT_VARIABLE FMHA_FWD_GEN_RESULT
)
# fail fast at configure time instead of producing a broken build later
if(NOT FMHA_FWD_GEN_RESULT EQUAL 0)
  message(FATAL_ERROR "generate.py failed to list fmha fwd kernel blobs")
endif()

# NOTE: for cmake, the FMHA_FWD_GEN_BLOBS files must be in the same directory
# as current cmake list, otherwise will not figure out the dependency properly
file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/blob_list.txt FMHA_FWD_GEN_BLOBS)

add_custom_command(
  OUTPUT ${FMHA_FWD_GEN_BLOBS}
  COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
  --output_dir ${CMAKE_CURRENT_BINARY_DIR}
  # re-run codegen whenever the generator script itself changes
  DEPENDS ${CMAKE_CURRENT_LIST_DIR}/generate.py
)

set(EXAMPLE_FMHA_FWD "example_fmha_fwd")
add_example_executable(${EXAMPLE_FMHA_FWD} fmha_fwd.cpp)
target_include_directories(${EXAMPLE_FMHA_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
target_sources(${EXAMPLE_FMHA_FWD} PRIVATE ${FMHA_FWD_GEN_BLOBS})

# NOTE: this is dangerous since will change the whole kernel to flush denormals
# WIP with compiler team for an exp2 intrinsic..., then remove this
if(NOT DEFINED FMHA_FWD_FAST_EXP2)
    set(FMHA_FWD_FAST_EXP2 true)
endif()

set(EXAMPLE_FMHA_FWD_COMPILE_OPTIONS)

# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
# ... because they are auto-generated
# (dropped stray debug flags `-v --save-temps`, which spam the build log and litter the
#  build tree with preprocessed temporaries; `-Wno-gnu-line-marker` was only needed to
#  silence warnings coming from the --save-temps output)
if(FMHA_FWD_FAST_EXP2)
    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
else()
    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_FMHA_FWD_FAST_EXP2=0)
endif()

# Allow comparing floating points directly in order to check sentinel values
list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)

target_compile_options(${EXAMPLE_FMHA_FWD} PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
|
||||
94
example/ck_tile/01_fmha/README.md
Normal file
94
example/ck_tile/01_fmha/README.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# fused multi-head attention
|
||||
|
||||
This folder contains example for fmha(fused multi-head attention) using ck_tile tile-programming implementation. It is a good example to demonstrate the usage of tile-programming API, as well as illustrate the new approach to construct a kernel template and instantiate it(them) while keeping compile time fast.
|
||||
|
||||
## build
|
||||
```
|
||||
# in the root of ck_tile
|
||||
mkdir build && cd build
|
||||
sh ../script/cmake-ck_tile-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942...
|
||||
make example_fmha_fwd -j
|
||||
```
|
||||
This will result in an executable `build/bin/example_fmha_fwd`
|
||||
|
||||
## kernel
|
||||
The kernel template is `fmha_fwd_kernel.hpp`, this is the grid-wise op in old ck_tile's terminology. We put it here purposely, to demonstrate one can construct a kernel by using various internal component from ck_tile. We may still have an implementation under ck_tile's include path (in the future) for the kernel template.
|
||||
|
||||
There are 3 template parameters for this kernel template.
|
||||
* `TilePartitioner` is used to map the workgroup to corresponding tile, `fmha_fwd_tile_partitioner.hpp` in this folder served as this purpose.
|
||||
* `FmhaPipeline` is one of the block_tile_pipeline(under `include/ck_tile/tile_program/block_tile_pipeline`) which is a performance critical component. Indeed, we did a lot of optimization and trials to optimize the pipeline and may still workout more performance pipeline and update into that folder. People only need to replace this pipeline type and would be able to enjoy the benefit of different performant implementations (stay tuned for updated pipeline(s)).
|
||||
* `EpiloguePipeline` will modify and store out the result in the last phase. People usually will do lot of post-fusion at this stage, so we also abstract this concept. Currently we didn't do much thing at the epilogue stage but leave the room for future possible support.
|
||||
|
||||
## codegen
|
||||
To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable.
|
||||
|
||||
## executable
|
||||
`example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/example_fmha_fwd -?` to list all supported args. Below is an example of the output (may be subject to change)
|
||||
```
|
||||
args:
|
||||
-v weather do CPU validation or not (default:1)
<!-- NOTE(review): "weather" should read "whether to"; kept as-is here because this line mirrors the program's help output — fix the source string in fmha_fwd.cpp first -->
|
||||
-mode kernel mode. 0:batch, 1:group (default:0)
|
||||
-b batch size (default:2)
|
||||
-h num of head, for q (default:8)
|
||||
-h_k num of head, for k/v, 0 means equal to h (default:0)
|
||||
if not equal to h, then this is GQA/MQA case
|
||||
-s seqlen_q (default:3328)
|
||||
-s_k seqlen_k, 0 means equal to s (default:0)
|
||||
-d head dim for q, k (default:128)
|
||||
-d_v head dim for v, 0 means equal to d (default:0)
|
||||
-scale scale factor. 0 means equal to 1/sqrt(hdim) (default:0)
|
||||
-descale_q scale factor for fp8 quantization (default:1)
|
||||
-descale_k scale factor for fp8 quantization (default:1)
|
||||
-descale_v scale factor for fp8 quantization (default:1)
|
||||
-iperm permute input (default:1)
|
||||
if true, will be b*h*s*d, else b*s*h*d
|
||||
-operm permute output (default:1)
|
||||
-bias add bias or not (default:0)
|
||||
-prec data type. fp16/bf16/fp8/bf8 (default:fp16)
|
||||
-mask 0: no mask, 1: top-left, 2:bottom-right (default:0)
|
||||
't:l,r', top-left local-attn with left right size
|
||||
'b:l,r', bottom-r local-attn with left right size
|
||||
'g:y,x', generic attention mask coordinate with y/x size
|
||||
|
||||
-vlayout r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r)
|
||||
-lse 0 not store lse, 1 store lse (default:0)
|
||||
-kname if set to 1 will print kernel name (default:0)
|
||||
```
|
||||
Example: `./bin/example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case.
|
||||
|
||||
## support features
|
||||
Currently we are still in rapid development stage, so more features/optimizations will be coming soon.
|
||||
|
||||
### hdim
|
||||
Currently we support `32/64/128/256` hdim for `fp16`/`bf16`, within which `64`/`128` is better optimized. hdim should be a multiple of 8, while seqlen_s can be arbitrary. An arbitrary hdim can be supported through the padding kernel of the `qr` pipeline (we don't generate this in generate.py by default)
|
||||
|
||||
### group/batch mode
|
||||
Currently we support both batch and group mode, by setting `-mode` = `0` or `1`, where in group mode we support each batch can have different seqlen
|
||||
|
||||
### MQA/GQA
|
||||
By setting `-h`(nhead for q) and `-h_k`(nhead for k/v) to different numbers, you can achieve MQA/GQA. Please pay attention that `h % h_k == 0` when you set different numbers.
|
||||
|
||||
### input/output permute, and `b*s*3*h*d`
|
||||
If you look at the kernel argument inside `fmha_fwd_kernel.hpp`, we support providing arbitrary stride for seqlen(stride_q/k/v), nhead, batch of q/k/v matrix, hence it is very flexible to support `b*h*s*d` or `b*s*h*d` input/output permute. The `-iperm=0/1`, `-operm=0/1` is a convenient way to achieve this through the executable. We didn't provide a command-line arg to test `b*s*3*h*d` layout which is by default used by torch/FA, but it's trivial to achieve this if one set the proper `stride_q/k/v` value as `3*h*d`.
|
||||
|
||||
### attention bias
|
||||
Attention bias is supported with the layout of `1*1*s*s`(similar to input/output; a different layout can be supported by changing the stride value for bias, or even extended to `b*h*s*s`) and bias values in float number.
|
||||
|
||||
### lse
|
||||
For training kernels, "log sum exp" need to store out in forward and used in backward. We support this by setting `-lse=1`
|
||||
|
||||
### vlayout
|
||||
We support v matrix in both row-major(`seqlen*hdim`) and col-major(`hdim*seqlen`). Since the accumulate(reduce) dimension for V is along `seqlen`, for current AMD's mfma layout which expect each thread to have contiguous register holding pixels along reduce dimension, it's easier to support col-major V layout. However, the performance of col-major is not necessarily faster than row-major, there are many factors that may affect the overall performance. We still provide the `-vlayout=r/c` here to switch/test between different layouts.
|
||||
|
||||
### generic attention mask coordinate
|
||||
We unify the mask expression into generic attention mask coordinate, providing an uniformed approach to describe causal top-left, causal bottom-right, local attention.
|
||||

|
||||
|
||||
(more description to be added)
|
||||
|
||||
### dropout
|
||||
TBD
|
||||
|
||||
## FP8 experimental support
|
||||
As described in [this blog](https://blog.hippoml.com/8bit-hippoattention-up-to-3x-faster-compared-to-flashattentionv2-8f9def90b482), we have experimental support for fp8 fmha kernels. You can evaluate the performance by passing `-prec=fp8` to `example_fmha_fwd`, on a gfx940/941/942 machine with ROCm 6.0+. Currently, unless you explicitly set `-v=0` (which disables CPU verification), it will report a numerical error of up to `0.05`. We are still working on tuning the kernel performance as well as the precision, so stay tuned for updated performance (pipelines)
|
||||
Currently we only support `-vlayout=c` for fp8, which is `hdim*seqlen` for V matrix. row major for V matrix support will come later.
|
||||
521
example/ck_tile/01_fmha/fmha_fwd.cpp
Normal file
521
example/ck_tile/01_fmha/fmha_fwd.cpp
Normal file
@@ -0,0 +1,521 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <array>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iomanip>
#include <iostream>
#include <limits>
#include <numeric>
#include <optional>
#include <ostream>
#include <string>
#include <tuple>
#include <utility>

#include "fmha_fwd.hpp"
#include "ck_tile/host.hpp"
#include "mask.hpp"
#include "utils.hpp"
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("v", "1", "weather do CPU validation or not")
|
||||
.insert("mode", "0", "kernel mode. 0:batch, 1:group")
|
||||
.insert("b", "2", "batch size")
|
||||
.insert("h", "8", "num of head, for q")
|
||||
.insert("h_k",
|
||||
"0",
|
||||
"num of head, for k/v, 0 means equal to h\n"
|
||||
"if not equal to h, then this is GQA/MQA case")
|
||||
.insert("s", "3328", "seqlen_q")
|
||||
.insert("s_k", "0", "seqlen_k, 0 means equal to s")
|
||||
.insert("d", "128", "head dim for q, k")
|
||||
.insert("d_v", "0", "head dim for v, 0 means equal to d")
|
||||
.insert("scale", "0", "scale factor. 0 means equal to 1/sqrt(hdim)")
|
||||
.insert("descale_q", "1", "scale factor for fp8 quantization")
|
||||
.insert("descale_k", "1", "scale factor for fp8 quantization")
|
||||
.insert("descale_v", "1", "scale factor for fp8 quantization")
|
||||
.insert("iperm",
|
||||
"1",
|
||||
"permute input\n"
|
||||
"if true, will be b*h*s*d, else b*s*h*d")
|
||||
.insert("operm", "1", "permute output")
|
||||
.insert("bias", "0", "add bias or not")
|
||||
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
|
||||
.insert("mask",
|
||||
"0",
|
||||
"0: no mask, 1: top-left, 2:bottom-right\n"
|
||||
"'t:l,r', top-left local-attn with left right size\n"
|
||||
"'b:l,r', bottom-r local-attn with left right size\n"
|
||||
"'g:y,x', generic attention mask coordinate with y/x size\n")
|
||||
.insert("vlayout", "r", "r for row-major(seqlen*hdim), c for col-major(hdim*seqlen)")
|
||||
.insert("lse", "0", "0 not store lse, 1 store lse")
|
||||
.insert("kname", "0", "if set to 1 will print kernel name")
|
||||
.insert("init", "1", "init method. 0:random int, 1:random float, 2:trig float")
|
||||
.insert("seed",
|
||||
"11939",
|
||||
"random seed used for initializing input tensors. 0 to use "
|
||||
"non-deterministic random number as seed")
|
||||
.insert("warmup", "5", "number of iterations before benchmark the kernel")
|
||||
.insert("repeat", "20", "number of iterations to benchmark the kernel");
|
||||
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
// different threshold for different dtype
|
||||
template <typename DataType>
|
||||
auto get_elimit(int /*init_method*/)
|
||||
{
|
||||
double rtol = 1e-3;
|
||||
double atol = 1e-3;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
template <>
|
||||
auto get_elimit<ck_tile::bhalf_t>(int init_method)
|
||||
{
|
||||
if(init_method == 0)
|
||||
{
|
||||
double rtol = 1e-2;
|
||||
double atol = 1e-2;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
else
|
||||
{
|
||||
double rtol = 3e-3;
|
||||
double atol = 3e-3;
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename DataType>
|
||||
bool run(const ck_tile::ArgParser& arg_parser)
|
||||
{
|
||||
std::string data_type = arg_parser.get_str("prec");
|
||||
int do_validation = arg_parser.get_int("v");
|
||||
auto mode = static_cast<mode_enum>(arg_parser.get_uint32("mode"));
|
||||
ck_tile::index_t batch = arg_parser.get_int("b");
|
||||
ck_tile::index_t nhead = arg_parser.get_int("h");
|
||||
ck_tile::index_t nhead_k = arg_parser.get_int("h_k");
|
||||
if(nhead_k == 0)
|
||||
nhead_k = nhead;
|
||||
|
||||
if(nhead % nhead_k != 0)
|
||||
{
|
||||
std::cerr << "nhead:" << nhead << " must be multiple of nhead_k:" << nhead_k << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
ck_tile::index_t seqlen_q = arg_parser.get_int("s");
|
||||
ck_tile::index_t seqlen_k = arg_parser.get_int("s_k");
|
||||
if(seqlen_k == 0)
|
||||
seqlen_k = seqlen_q;
|
||||
ck_tile::index_t hdim_q = arg_parser.get_int("d");
|
||||
ck_tile::index_t hdim_v = arg_parser.get_int("d_v");
|
||||
if(hdim_v == 0)
|
||||
hdim_v = hdim_q;
|
||||
|
||||
bool i_perm = arg_parser.get_bool("iperm"); // if true, will be batch * nhead * seqlen * hdim
|
||||
bool o_perm = arg_parser.get_bool("operm"); // if false, will be batch * seqlen * nhead * hdim
|
||||
|
||||
float scale = arg_parser.get_float("scale");
|
||||
if(scale == .0f)
|
||||
scale = 1.0 / ck_tile::sqrt(static_cast<float>(hdim_q)); // TODO: q ? v ?
|
||||
|
||||
float descale_q = arg_parser.get_float("descale_q");
|
||||
float descale_k = arg_parser.get_float("descale_k");
|
||||
float descale_v = arg_parser.get_float("descale_v");
|
||||
|
||||
std::string vlayout = arg_parser.get_str("vlayout");
|
||||
bool use_bias = arg_parser.get_bool("bias");
|
||||
bool lse = arg_parser.get_bool("lse");
|
||||
|
||||
mask_info mask = mask_info::decode(arg_parser.get_str("mask"), seqlen_q, seqlen_k);
|
||||
|
||||
int init_method = arg_parser.get_int("init");
|
||||
std::optional<uint32_t> seed = arg_parser.get_uint32("seed");
|
||||
if(*seed == 0)
|
||||
{
|
||||
seed.reset();
|
||||
}
|
||||
|
||||
int stream_warmup = arg_parser.get_int("warmup");
|
||||
int stream_repeat = arg_parser.get_int("repeat");
|
||||
bool kname = arg_parser.get_bool("kname");
|
||||
|
||||
stream_config stream_config{
|
||||
nullptr, true, /* log_level = */ (kname ? 1 : 0), stream_warmup, stream_repeat};
|
||||
|
||||
const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q);
|
||||
const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k);
|
||||
|
||||
using TypeConfig = FmhaFwdTypeConfig<DataType>;
|
||||
|
||||
using QDataType = typename TypeConfig::QDataType;
|
||||
using KDataType = typename TypeConfig::KDataType;
|
||||
using VDataType = typename TypeConfig::VDataType;
|
||||
using BiasDataType = typename TypeConfig::BiasDataType;
|
||||
using LSEDataType = typename TypeConfig::LSEDataType;
|
||||
using SaccDataType = typename TypeConfig::SaccDataType;
|
||||
using SMPLComputeDataType = typename TypeConfig::SMPLComputeDataType;
|
||||
using PDataType = typename TypeConfig::PDataType;
|
||||
using OaccDataType = typename TypeConfig::OaccDataType;
|
||||
using ODataType = typename TypeConfig::ODataType;
|
||||
|
||||
// accumulation numbers for performance evaluation
|
||||
std::size_t flop = 0, num_byte = 0;
|
||||
auto max_seqlen_q =
|
||||
std::numeric_limits<int32_t>::min(); // we will use max seqlen to decide grid size
|
||||
{
|
||||
for(ck_tile::index_t wb = 0; wb < batch; ++wb)
|
||||
{
|
||||
const int32_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
|
||||
const int32_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
|
||||
|
||||
if(max_seqlen_q < real_seqlen_q)
|
||||
{
|
||||
max_seqlen_q = real_seqlen_q;
|
||||
}
|
||||
|
||||
flop += nhead * (static_cast<std::size_t>(2) * real_seqlen_q * real_seqlen_k * hdim_q +
|
||||
static_cast<std::size_t>(2) * real_seqlen_q * hdim_v * real_seqlen_k);
|
||||
|
||||
num_byte += nhead * (sizeof(QDataType) * real_seqlen_q * hdim_q +
|
||||
sizeof(KDataType) * real_seqlen_k * hdim_q +
|
||||
sizeof(VDataType) * hdim_v * real_seqlen_k +
|
||||
sizeof(ODataType) * real_seqlen_q * hdim_v);
|
||||
}
|
||||
}
|
||||
|
||||
auto get_lengths = [&](bool permute,
|
||||
ck_tile::index_t b /*batch*/,
|
||||
ck_tile::index_t h /*nhead*/,
|
||||
ck_tile::index_t s /*seqlen*/,
|
||||
ck_tile::index_t d /*hdim*/) {
|
||||
if(permute)
|
||||
return std::array<ck_tile::index_t, 4>{b, h, s, d};
|
||||
else
|
||||
return std::array<ck_tile::index_t, 4>{b, s, h, d};
|
||||
};
|
||||
|
||||
bool is_v_rowmajor = vlayout == std::string("r");
|
||||
|
||||
// host memory for storing all the tensor elements
|
||||
const ck_tile::index_t shape_batch = (mode == mode_enum::batch ? batch : 1);
|
||||
const ck_tile::index_t shape_seqlen_q =
|
||||
(mode == mode_enum::batch ? seqlen_q : seqstart_q_host.back());
|
||||
const ck_tile::index_t shape_seqlen_k =
|
||||
(mode == mode_enum::batch ? seqlen_k : seqstart_k_host.back());
|
||||
|
||||
HostTensor<QDataType> q_host(get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q));
|
||||
HostTensor<KDataType> k_host(get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_q));
|
||||
HostTensor<VDataType> v_host(
|
||||
is_v_rowmajor ? get_lengths(i_perm, shape_batch, nhead_k, shape_seqlen_k, hdim_v)
|
||||
: get_lengths(i_perm, shape_batch, nhead_k, hdim_v, shape_seqlen_k));
|
||||
// use bias shape = [1, 1, shape_seqlen_q, shape_seqlen_k]. if use_bias=false, the bias_host
|
||||
// will not be used for verification at all (but will be copied to device anyway).
|
||||
HostTensor<BiasDataType> bias_host(
|
||||
use_bias ? get_lengths(i_perm, 1, 1, shape_seqlen_q, shape_seqlen_k)
|
||||
: std::array<ck_tile::index_t, 4>{1, 1, 1, 1} /* dummy shape for simplifying code */);
|
||||
// self define lse data layout as [shape_batch, nhead, shape_seqlen_q]
|
||||
HostTensor<LSEDataType> lse_host(
|
||||
lse ? std::array<ck_tile::index_t, 3>{shape_batch, nhead, shape_seqlen_q}
|
||||
: std::array<ck_tile::index_t, 3>{1, 1, 1} /* dummy shape for simplifying code */);
|
||||
|
||||
HostTensor<ODataType> o_host(get_lengths(o_perm, shape_batch, nhead, shape_seqlen_q, hdim_v));
|
||||
|
||||
if(init_method == 0)
|
||||
{
|
||||
ck_tile::utils::FillUniformDistributionIntegerValue<QDataType>{-2.f, 2.f, seed}(q_host);
|
||||
ck_tile::utils::FillUniformDistributionIntegerValue<KDataType>{-2.f, 2.f, seed}(k_host);
|
||||
ck_tile::utils::FillUniformDistributionIntegerValue<VDataType>{-2.f, 2.f, seed}(v_host);
|
||||
ck_tile::utils::FillUniformDistributionIntegerValue<BiasDataType>{-2.f, 2.f, seed}(bias_host);
|
||||
}
|
||||
else if(init_method == 1)
|
||||
{
|
||||
ck_tile::utils::FillUniformDistribution<QDataType>{0.f, 1.f, seed}(q_host);
|
||||
ck_tile::utils::FillUniformDistribution<KDataType>{0.f, 1.f, seed}(k_host);
|
||||
ck_tile::utils::FillUniformDistribution<VDataType>{0.f, 1.f, seed}(v_host);
|
||||
ck_tile::utils::FillUniformDistribution<BiasDataType>{0.f, 1.f, seed}(bias_host);
|
||||
}
|
||||
else if(init_method == 2)
|
||||
{
|
||||
ck_tile::utils::FillTrigValue<QDataType>{}(q_host);
|
||||
ck_tile::utils::FillTrigValue<KDataType>{}(k_host);
|
||||
ck_tile::utils::FillTrigValue<VDataType>{}(v_host);
|
||||
ck_tile::utils::FillTrigValue<BiasDataType>{}(bias_host);
|
||||
}
|
||||
|
||||
DeviceMem q_buf(q_host.get_element_space_size_in_bytes());
|
||||
DeviceMem k_buf(k_host.get_element_space_size_in_bytes());
|
||||
DeviceMem v_buf(v_host.get_element_space_size_in_bytes());
|
||||
DeviceMem bias_buf(bias_host.get_element_space_size_in_bytes());
|
||||
DeviceMem lse_buf(lse_host.get_element_space_size_in_bytes());
|
||||
DeviceMem o_buf(o_host.get_element_space_size_in_bytes());
|
||||
DeviceMem seqstart_q(seqstart_q_host.size() * sizeof(int32_t));
|
||||
DeviceMem seqstart_k(seqstart_k_host.size() * sizeof(int32_t));
|
||||
|
||||
q_buf.ToDevice(q_host.data());
|
||||
k_buf.ToDevice(k_host.data());
|
||||
v_buf.ToDevice(v_host.data());
|
||||
bias_buf.ToDevice(bias_host.data());
|
||||
seqstart_q.ToDevice(seqstart_q_host.data());
|
||||
seqstart_k.ToDevice(seqstart_k_host.data());
|
||||
|
||||
// clang-format off
|
||||
auto layout_str = [&](bool permute){
|
||||
if (permute) return std::string("bhsd");
|
||||
else return std::string("bshd");
|
||||
};
|
||||
auto io_layout = [&](bool iperm_, bool operm_) {
|
||||
if (iperm_ == operm_) return layout_str(iperm_);
|
||||
else return layout_str(iperm_) + std::string("-") + layout_str(operm_);
|
||||
};
|
||||
// clang-format on
|
||||
const std::string prec = arg_parser.get_str("prec");
|
||||
|
||||
std::cout << "[" << prec << "|" << mode << "|" << io_layout(i_perm, o_perm) << "] b:" << batch
|
||||
<< ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_q << "/" << seqlen_k
|
||||
<< ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << use_bias
|
||||
<< ", lse:" << lse << ", mask:" << mask << ", v:" << vlayout << std::flush;
|
||||
|
||||
auto fmha_traits = fmha_fwd_traits{hdim_q,
|
||||
hdim_v,
|
||||
data_type,
|
||||
mode == mode_enum::group,
|
||||
is_v_rowmajor,
|
||||
mask.type,
|
||||
use_bias,
|
||||
lse};
|
||||
auto fmha_args = fmha_fwd_args{q_buf.GetDeviceBuffer(),
|
||||
k_buf.GetDeviceBuffer(),
|
||||
v_buf.GetDeviceBuffer(),
|
||||
bias_buf.GetDeviceBuffer(),
|
||||
lse_buf.GetDeviceBuffer(),
|
||||
o_buf.GetDeviceBuffer(),
|
||||
seqstart_q.GetDeviceBuffer(),
|
||||
seqstart_k.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
batch,
|
||||
nhead,
|
||||
nhead_k,
|
||||
shape_seqlen_q,
|
||||
shape_seqlen_k,
|
||||
hdim_q,
|
||||
hdim_v,
|
||||
max_seqlen_q,
|
||||
scale,
|
||||
descale_q * descale_k,
|
||||
descale_v,
|
||||
i_perm,
|
||||
o_perm,
|
||||
mask.y,
|
||||
mask.x};
|
||||
|
||||
float ave_time = fmha_fwd(fmha_traits, fmha_args, stream_config);
|
||||
|
||||
if(ave_time < 0)
|
||||
{
|
||||
std::cout << ", not supported yet" << std::flush << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << std::fixed << ", " << std::setprecision(3) << ave_time << " ms, "
|
||||
<< std::setprecision(2) << tflops << " TFlops, " << std::setprecision(2) << gb_per_sec
|
||||
<< " GB/s" << std::flush;
|
||||
|
||||
if(!do_validation)
|
||||
{
|
||||
std::cout << std::flush << std::endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
o_buf.FromDevice(o_host.data());
|
||||
lse_buf.FromDevice(lse_host.data());
|
||||
|
||||
bool pass = true;
|
||||
|
||||
for(ck_tile::index_t wb = 0; wb < batch; ++wb)
|
||||
{
|
||||
const ck_tile::index_t real_seqlen_q = seqstart_q_host[wb + 1] - seqstart_q_host[wb];
|
||||
const ck_tile::index_t real_seqlen_k = seqstart_k_host[wb + 1] - seqstart_k_host[wb];
|
||||
|
||||
// adjust matrix index according to the mode
|
||||
const ck_tile::index_t b = (mode == mode_enum::batch ? wb : 0);
|
||||
const ck_tile::index_t query_offset = (mode == mode_enum::batch ? 0 : seqstart_q_host[wb]);
|
||||
const ck_tile::index_t key_offset = (mode == mode_enum::batch ? 0 : seqstart_k_host[wb]);
|
||||
|
||||
const auto v_host_ref_lengths = std::array<ck_tile::index_t, 3>{nhead, hdim_v, real_seqlen_k};
|
||||
const auto v_host_ref_strides =
|
||||
is_v_rowmajor ? std::array<ck_tile::index_t, 3>{hdim_v * real_seqlen_k, 1, hdim_v}
|
||||
: std::array<ck_tile::index_t, 3>{hdim_v * real_seqlen_k, real_seqlen_k, 1};
|
||||
|
||||
HostTensor<QDataType> q_host_ref({nhead, real_seqlen_q, hdim_q});
|
||||
HostTensor<KDataType> k_host_ref({nhead, real_seqlen_k, hdim_q});
|
||||
HostTensor<VDataType> v_host_ref(v_host_ref_lengths, v_host_ref_strides);
|
||||
HostTensor<ODataType> o_host_ref({nhead, real_seqlen_q, hdim_v});
|
||||
|
||||
HostTensor<SMPLComputeDataType> s_host_ref({nhead, real_seqlen_q, real_seqlen_k});
|
||||
HostTensor<PDataType> p_host_ref({nhead, real_seqlen_q, real_seqlen_k});
|
||||
HostTensor<SMPLComputeDataType> lse_host_ref({nhead, real_seqlen_q});
|
||||
|
||||
ck_tile::index_t nr = nhead / nhead_k;
|
||||
|
||||
// clang-format off
|
||||
// permute
|
||||
if(i_perm) q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[0], i[1] + query_offset, i[2]); });
|
||||
else q_host_ref.ForEach([&](auto& self, auto i) { self(i) = q_host(b, i[1] + query_offset, i[0], i[2]); });
|
||||
|
||||
if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[0] / nr, i[1] + key_offset, i[2]); });
|
||||
else k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(b, i[1] + key_offset, i[0] / nr, i[2]); });
|
||||
|
||||
if (is_v_rowmajor) {
|
||||
// v_host_ref: [nhead, hdim, seq], v_host: [b, h_k, s, d]
|
||||
if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[0] / nr, i[2] + key_offset, i[1]); });
|
||||
// v_host_ref: [nhead, hdim, seq], v_host: [b, s, h_k, d]
|
||||
else v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[2] + key_offset, i[0] / nr, i[1]); });
|
||||
}
|
||||
else {
|
||||
if(i_perm) v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[0] / nr, i[1], i[2] + key_offset); });
|
||||
else v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(b, i[1], i[0] / nr, i[2] + key_offset); });
|
||||
}
|
||||
// clang-format on
|
||||
|
||||
// reference
|
||||
reference_batched_gemm<QDataType, KDataType, SaccDataType, SMPLComputeDataType>(
|
||||
q_host_ref,
|
||||
k_host_ref,
|
||||
s_host_ref,
|
||||
ck_tile::identity{},
|
||||
ck_tile::identity{},
|
||||
[&](SaccDataType x) { return scale * x; });
|
||||
|
||||
if(use_bias)
|
||||
{
|
||||
HostTensor<BiasDataType> bias_host_ref({1, real_seqlen_q, real_seqlen_k});
|
||||
// clang-format off
|
||||
if(i_perm)
|
||||
bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, 0, i[1] + query_offset, i[2] + key_offset); });
|
||||
else
|
||||
bias_host_ref.ForEach([&](auto& self, auto i) { self(i) = bias_host(0, i[1] + query_offset, 0, i[2] + key_offset); });
|
||||
// clang-format on
|
||||
|
||||
// broadcast from [1, real_seqlen_q, real_seqlen_k] to [nhead, real_seqlen_q,
|
||||
// real_seqlen_k]
|
||||
reference_batched_elementwise<SMPLComputeDataType,
|
||||
BiasDataType,
|
||||
SMPLComputeDataType,
|
||||
SMPLComputeDataType>(
|
||||
s_host_ref, bias_host_ref, s_host_ref);
|
||||
}
|
||||
|
||||
if(mask.type == mask_enum::no_mask)
|
||||
{
|
||||
reference_batched_masking<SaccDataType>(
|
||||
s_host_ref, FmhaMasks::NoMask{real_seqlen_q, real_seqlen_k});
|
||||
}
|
||||
else if(mask.type == mask_enum::window_generic)
|
||||
{
|
||||
reference_batched_masking<SaccDataType>(
|
||||
s_host_ref, FmhaMasks::GenericMask{mask.y, mask.x, real_seqlen_q, real_seqlen_k});
|
||||
}
|
||||
else
|
||||
{
|
||||
reference_batched_masking<SaccDataType>(
|
||||
s_host_ref, FmhaMasks::CausalMask{mask.y, mask.x, real_seqlen_q, real_seqlen_k});
|
||||
}
|
||||
if(lse)
|
||||
{
|
||||
reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
|
||||
s_host_ref, p_host_ref, lse_host_ref);
|
||||
}
|
||||
else
|
||||
{
|
||||
reference_batched_softmax<SMPLComputeDataType, SMPLComputeDataType, PDataType>(
|
||||
s_host_ref, p_host_ref);
|
||||
}
|
||||
|
||||
reference_batched_gemm<PDataType, VDataType, OaccDataType, ODataType>(
|
||||
p_host_ref, v_host_ref, o_host_ref);
|
||||
|
||||
HostTensor<ODataType> o_host_result({nhead, real_seqlen_q, hdim_v});
|
||||
// clang-format off
|
||||
// permute
|
||||
if(o_perm) o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b, idx[0], idx[1] + query_offset, idx[2]); });
|
||||
else o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b, idx[1] + query_offset, idx[0], idx[2]); });
|
||||
// clang-format on
|
||||
|
||||
auto [rtol, atol] = get_elimit<DataType>(init_method);
|
||||
bool cur_pass = ck_tile::utils::check_err(
|
||||
o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
|
||||
pass &= cur_pass;
|
||||
if(!cur_pass)
|
||||
{
|
||||
std::cerr << "OUT mismatch found at batch: " << wb << std::endl
|
||||
<< "\tseqlen_q: " << real_seqlen_q << std::endl
|
||||
<< "\tseqlen_k: " << real_seqlen_k << std::endl
|
||||
<< "\tseqstart_q: " << seqstart_q_host << std::endl
|
||||
<< "\tseqstart_k: " << seqstart_k_host << std::endl;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if(lse)
|
||||
{
|
||||
HostTensor<SMPLComputeDataType> lse_host_result({nhead, real_seqlen_q});
|
||||
lse_host_result.ForEach([&](auto& self, auto idx) {
|
||||
self(idx) = lse_host(b, idx[0], idx[1] + query_offset);
|
||||
});
|
||||
|
||||
bool lse_pass = ck_tile::utils::check_err(lse_host_result,
|
||||
lse_host_ref,
|
||||
"LSE Error: Incorrect results!",
|
||||
rtol,
|
||||
atol,
|
||||
/* allow_infinity_ref = */ true);
|
||||
|
||||
pass &= lse_pass;
|
||||
if(!cur_pass)
|
||||
{
|
||||
std::cerr << "LSE mismatch found at batch: " << wb << std::endl
|
||||
<< "\tseqlen_q: " << real_seqlen_q << std::endl
|
||||
<< "\tseqlen_k: " << real_seqlen_k << std::endl
|
||||
<< "\tseqstart_q: " << seqstart_q_host << std::endl
|
||||
<< "\tseqstart_k: " << seqstart_k_host << std::endl;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
const std::string data_type = arg_parser.get_str("prec");
|
||||
if(data_type == "fp16")
|
||||
{
|
||||
return run<ck_tile::half_t>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(data_type == "bf16")
|
||||
{
|
||||
return run<ck_tile::bhalf_t>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
else if(data_type == "fp8")
|
||||
{
|
||||
return run<ck_tile::fp8_t>(arg_parser) ? 0 : -2;
|
||||
}
|
||||
|
||||
return -3;
|
||||
}
|
||||
336
example/ck_tile/01_fmha/fmha_fwd.hpp
Normal file
336
example/ck_tile/01_fmha/fmha_fwd.hpp
Normal file
@@ -0,0 +1,336 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/fmha.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/common.hpp"
|
||||
#include "mask.hpp"
|
||||
|
||||
template <typename DataType>
|
||||
struct FmhaFwdTypeConfig;
|
||||
|
||||
// Per-tensor element types for the fp16 forward-attention kernel.
// Accumulation and softmax statistics stay in fp32 for accuracy.
template <>
struct FmhaFwdTypeConfig<ck_tile::half_t>
{
    using QDataType = ck_tile::half_t;
    using KDataType = ck_tile::half_t;
    using VDataType = ck_tile::half_t;
    using BiasDataType = ck_tile::half_t;
    using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
    using SaccDataType = float; // data type for first gemm accumulation
    using SMPLComputeDataType = float; // data type for reduction, softmax
    using PDataType = ck_tile::half_t; // data type for A matrix of second gemm
    using OaccDataType = float; // data type for second gemm accumulation
    using ODataType = ck_tile::half_t;
};
|
||||
|
||||
// Per-tensor element types for the bf16 forward-attention kernel.
// Accumulation and softmax statistics stay in fp32 for accuracy.
template <>
struct FmhaFwdTypeConfig<ck_tile::bhalf_t>
{
    using QDataType = ck_tile::bhalf_t;
    using KDataType = ck_tile::bhalf_t;
    using VDataType = ck_tile::bhalf_t;
    using BiasDataType = ck_tile::bhalf_t;
    using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
    using SaccDataType = float; // data type for first gemm accumulation
    using SMPLComputeDataType = float; // data type for reduction, softmax
    using PDataType = ck_tile::bhalf_t; // data type for A matrix of second gemm
    using OaccDataType = float; // data type for second gemm accumulation
    using ODataType = ck_tile::bhalf_t;
};
|
||||
|
||||
// Per-tensor element types for the fp8 forward-attention kernel.
// Note the bias is kept in fp32 here (see TODO below), unlike fp16/bf16.
template <>
struct FmhaFwdTypeConfig<ck_tile::fp8_t>
{
    using QDataType = ck_tile::fp8_t;
    using KDataType = ck_tile::fp8_t;
    using VDataType = ck_tile::fp8_t;
    using BiasDataType = float; // TODO: fix me
    using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
    using SaccDataType = float; // data type for first gemm accumulation
    using SMPLComputeDataType = float; // data type for reduction, softmax
    using PDataType = ck_tile::fp8_t; // data type for A matrix of second gemm
    using OaccDataType = float; // data type for second gemm accumulation
    using ODataType = ck_tile::fp8_t;
};
|
||||
|
||||
// Per-tensor element types for the bf8 forward-attention kernel.
// NOTE(review): unlike fp8, bias is bf8 here — confirm whether it should
// also be float like the fp8 specialization above.
template <>
struct FmhaFwdTypeConfig<ck_tile::bf8_t>
{
    using QDataType = ck_tile::bf8_t;
    using KDataType = ck_tile::bf8_t;
    using VDataType = ck_tile::bf8_t;
    using BiasDataType = ck_tile::bf8_t;
    using LSEDataType = float; // data type for lse(logsumexp L_j = max_j + log(l_j))
    using SaccDataType = float; // data type for first gemm accumulation
    using SMPLComputeDataType = float; // data type for reduction, softmax
    using PDataType = ck_tile::bf8_t; // data type for A matrix of second gemm
    using OaccDataType = float; // data type for second gemm accumulation
    using ODataType = ck_tile::bf8_t;
};
|
||||
|
||||
// Shorthand aliases for the attention-mask types the generated kernels are
// instantiated with (referenced by name from generate.py's MASK_MAP).
struct FmhaMasks
{
    using NoMask = ck_tile::GenericAttentionMask<false>;
    using GenericMask = ck_tile::GenericAttentionMask<true, true>;
    using CausalMask = ck_tile::GenericAttentionMask<true, false>;
};
|
||||
|
||||
// internal API, don't use this directly
//
// Translate caller-level arguments into the kernel's kargs struct plus the
// launch grid. All tensor strides are derived here from the i_perm/o_perm
// flags: judging from the stride setup below, i_perm==true corresponds to a
// [batch, nhead, seqlen, hdim] input layout and i_perm==false to
// [batch, seqlen, nhead, hdim] (same idea for o_perm on the output) —
// TODO(review): confirm against the caller.
// Group mode (FmhaKernel::kIsGroupMode) uses the seqstart_*/seqlen_k pointers
// and omits batch strides; batch mode passes explicit seqlen_q/seqlen_k and
// per-batch strides.
template <typename FmhaKernel>
auto fmha_fwd_create_kargs_and_grids(const void* q_ptr,
                                     const void* k_ptr,
                                     const void* v_ptr,
                                     const void* bias_ptr,
                                     void* lse_ptr,
                                     void* o_ptr,
                                     const void* seqstart_q_ptr,
                                     const void* seqstart_k_ptr,
                                     const void* seqlen_k_ptr,
                                     ck_tile::index_t batch,
                                     ck_tile::index_t nhead,
                                     ck_tile::index_t nhead_k,
                                     ck_tile::index_t seqlen_q,
                                     ck_tile::index_t seqlen_k,
                                     ck_tile::index_t hdim_q,
                                     ck_tile::index_t hdim_v,
                                     ck_tile::index_t max_seqlen_q,
                                     float scale,
                                     float descale_qk,
                                     float descale_sv,
                                     bool i_perm,
                                     bool o_perm,
                                     ck_tile::index_t mask_y,
                                     ck_tile::index_t mask_x)
{
    // V layout (row/col major) changes how its seqlen/hdim strides are built.
    constexpr bool is_v_rowmajor =
        ck_tile::is_same_v<typename FmhaKernel::VLayout, ck_tile::tensor_layout::gemm::RowMajor>;

    // GQA/MQA: query heads must be an integer multiple of kv heads.
    assert(nhead % nhead_k == 0);
    /// NOTE: we broadcast bias from [1, 1, seqlen_q, seqlen_k] to [batch, nhead, seqlen_q,
    /// seqlen_k] in this example, hence both the 'batch_stride_bias' & 'nhead_stride_bias'
    /// are 0.
    // setup stride_* arguments (innermost/row stride of each tensor)
    const ck_tile::index_t stride_q = (i_perm ? hdim_q : nhead * hdim_q);
    const ck_tile::index_t stride_k = (i_perm ? hdim_q : nhead_k * hdim_q);
    const ck_tile::index_t stride_v = [&]() {
        if constexpr(is_v_rowmajor)
            return i_perm ? hdim_v : nhead_k * hdim_v;
        else
            return i_perm ? seqlen_k : nhead_k * seqlen_k;
    }();
    const ck_tile::index_t stride_bias = (i_perm ? seqlen_k : 1 * seqlen_k);
    const ck_tile::index_t stride_o = (o_perm ? hdim_v : nhead * hdim_v);
    // setup nhead_stride_* arguments (distance between consecutive heads)
    const ck_tile::index_t nhead_stride_q = (i_perm ? seqlen_q * hdim_q : hdim_q);
    const ck_tile::index_t nhead_stride_k = (i_perm ? seqlen_k * hdim_q : hdim_q);
    const ck_tile::index_t nhead_stride_v = [&]() {
        if constexpr(is_v_rowmajor)
            return i_perm ? seqlen_k * hdim_v : hdim_v;
        else
            return i_perm ? hdim_v * seqlen_k : seqlen_k;
    }();
    const ck_tile::index_t nhead_stride_bias = (i_perm ? 0 * seqlen_q * seqlen_k : 0 * seqlen_k);
    const ck_tile::index_t nhead_stride_lse = (seqlen_q * 1);
    const ck_tile::index_t nhead_stride_o = (o_perm ? seqlen_q * hdim_v : hdim_v);
    // setup batch_stride_* arguments (distance between consecutive batches;
    // only consumed by the batch-mode kargs below)
    const ck_tile::index_t batch_stride_q = (nhead * seqlen_q * hdim_q);
    const ck_tile::index_t batch_stride_k = (nhead_k * seqlen_k * hdim_q);
    const ck_tile::index_t batch_stride_v = (nhead_k * hdim_v * seqlen_k);
    const ck_tile::index_t batch_stride_bias = (0 * nhead * seqlen_q * seqlen_k);
    const ck_tile::index_t batch_stride_lse = (nhead * seqlen_q * 1);
    const ck_tile::index_t batch_stride_o = (nhead * seqlen_q * hdim_v);

    auto kargs = [&] {
        // create group mode kernel arguments
        if constexpr(FmhaKernel::kIsGroupMode)
        {
            return FmhaKernel::MakeKargs(q_ptr,
                                         k_ptr,
                                         v_ptr,
                                         bias_ptr,
                                         lse_ptr,
                                         o_ptr,
                                         seqstart_q_ptr,
                                         seqstart_k_ptr,
                                         seqlen_k_ptr,
                                         hdim_q,
                                         hdim_v,
                                         nhead / nhead_k, // q-heads per kv-head (GQA group size)
                                         scale,
                                         stride_q,
                                         stride_k,
                                         stride_v,
                                         stride_bias,
                                         stride_o,
                                         nhead_stride_q,
                                         nhead_stride_k,
                                         nhead_stride_v,
                                         nhead_stride_bias,
                                         nhead_stride_lse,
                                         nhead_stride_o,
                                         mask_y,
                                         mask_x,
                                         descale_qk,
                                         descale_sv);
        }
        else
        { // create batch mode kernel arguments
            return FmhaKernel::MakeKargs(q_ptr,
                                         k_ptr,
                                         v_ptr,
                                         bias_ptr,
                                         lse_ptr,
                                         o_ptr,
                                         seqlen_q,
                                         seqlen_k,
                                         hdim_q,
                                         hdim_v,
                                         nhead / nhead_k, // q-heads per kv-head (GQA group size)
                                         scale,
                                         stride_q,
                                         stride_k,
                                         stride_v,
                                         stride_bias,
                                         stride_o,
                                         nhead_stride_q,
                                         nhead_stride_k,
                                         nhead_stride_v,
                                         nhead_stride_bias,
                                         nhead_stride_lse,
                                         nhead_stride_o,
                                         batch_stride_q,
                                         batch_stride_k,
                                         batch_stride_v,
                                         batch_stride_bias,
                                         batch_stride_lse,
                                         batch_stride_o,
                                         mask_y,
                                         mask_x,
                                         descale_qk,
                                         descale_sv);
        }
    }();

    dim3 grids = FmhaKernel::GridSize(batch, nhead, max_seqlen_q, hdim_v);
    return ck_tile::make_tuple(kargs, grids);
}
|
||||
|
||||
// This is the args from caller to underneath API, different from the kernel
//
// Plain aggregate holding every runtime argument of the forward pass; it is
// unpacked field-by-field into fmha_fwd_create_kargs_and_grids() above.
struct fmha_fwd_args
{
    const void* q_ptr;
    const void* k_ptr;
    const void* v_ptr;
    const void* bias_ptr;          // may broadcast (see NOTE in create_kargs); nullable presumably when has_bias==false — confirm
    void* lse_ptr;                 // logsumexp output buffer
    void* o_ptr;                   // attention output buffer
    const void* seqstart_q_ptr;    // only consumed in group mode (see create_kargs)
    const void* seqstart_k_ptr;    // only consumed in group mode
    const void* seqlen_k_ptr;      // only consumed in group mode
    ck_tile::index_t batch;
    ck_tile::index_t nhead;        // number of query heads
    ck_tile::index_t nhead_k;      // number of kv heads (GQA); must divide nhead
    ck_tile::index_t seqlen_q;
    ck_tile::index_t seqlen_k;
    ck_tile::index_t hdim_q;       // head dim of Q/K
    ck_tile::index_t hdim_v;       // head dim of V/O
    ck_tile::index_t max_seqlen_q; // used for grid sizing
    float scale;                   // softmax scale applied in the first gemm
    float descale_qk;              // fp8 dequant factor for the QK gemm
    float descale_sv;              // fp8 dequant factor for the SV gemm
    bool i_perm;                   // input layout selector (see create_kargs stride setup)
    bool o_perm;                   // output layout selector
    ck_tile::index_t mask_y;       // generic-mask window coordinates (see mask.hpp)
    ck_tile::index_t mask_x;
};
|
||||
|
||||
// Convenience overload: unpack an fmha_fwd_args aggregate into the long-form
// internal API above. Field order here must track the parameter order of the
// primary overload exactly.
template <typename FmhaKernel>
auto fmha_fwd_create_kargs_and_grids(fmha_fwd_args args)
{
    return fmha_fwd_create_kargs_and_grids<FmhaKernel>(args.q_ptr,
                                                       args.k_ptr,
                                                       args.v_ptr,
                                                       args.bias_ptr,
                                                       args.lse_ptr,
                                                       args.o_ptr,
                                                       args.seqstart_q_ptr,
                                                       args.seqstart_k_ptr,
                                                       args.seqlen_k_ptr,
                                                       args.batch,
                                                       args.nhead,
                                                       args.nhead_k,
                                                       args.seqlen_q,
                                                       args.seqlen_k,
                                                       args.hdim_q,
                                                       args.hdim_v,
                                                       args.max_seqlen_q,
                                                       args.scale,
                                                       args.descale_qk,
                                                       args.descale_sv,
                                                       args.i_perm,
                                                       args.o_perm,
                                                       args.mask_y,
                                                       args.mask_x);
}
|
||||
|
||||
// this is used to pattern-match internl kernel implementation, not to instantiate kernel
//
// Compile-time key identifying one generated kernel instance. generate.py
// emits one explicit specialization of fmha_fwd_<trait_> per combination;
// these template parameters must stay in sync with that script.
template <ck_tile::index_t HDim_,
          typename DataType_,
          bool kIsGroupMode_,
          ck_tile::index_t kM0_,            // tile size along q seqlen
          ck_tile::index_t kN0_,            // tile size along k seqlen
          ck_tile::index_t kK0_,            // qk gemm unroll
          ck_tile::index_t kN1_,            // tile size along v head_dim
          ck_tile::index_t kK1_,            // kv gemm unroll
          ck_tile::index_t kK0BlockLength_, // total K0 length (whole-tile Q load)
          bool kIsVLayoutRowMajor_,
          typename FmhaMask_,
          bool kHasBias_,
          bool kStoreLse_,
          bool kPadS_,   // pad seqlen_q
          bool kPadSK_,  // pad seqlen_k
          bool kPadD_,   // pad hdim_q
          bool kPadDv_>  // pad hdim_v
struct fmha_fwd_traits_
{
    static constexpr ck_tile::index_t HDim = HDim_;
    using DataType = ck_tile::remove_cvref_t<DataType_>;
    static constexpr bool kIsGroupMode = kIsGroupMode_;
    static constexpr ck_tile::index_t kM0 = kM0_;
    static constexpr ck_tile::index_t kN0 = kN0_;
    static constexpr ck_tile::index_t kK0 = kK0_;
    static constexpr ck_tile::index_t kN1 = kN1_;
    static constexpr ck_tile::index_t kK1 = kK1_;
    static constexpr ck_tile::index_t kK0BlockLength = kK0BlockLength_;
    static constexpr bool kIsVLayoutRowMajor = kIsVLayoutRowMajor_;
    using FmhaMask = ck_tile::remove_cvref_t<FmhaMask_>;
    static constexpr bool kHasBias = kHasBias_;
    static constexpr bool kStoreLse = kStoreLse_;
    static constexpr bool kPadS = kPadS_;
    static constexpr bool kPadSK = kPadSK_;
    static constexpr bool kPadD = kPadD_;
    static constexpr bool kPadDv = kPadDv_;
};
|
||||
|
||||
// Per-instance entry point; each generated .cpp provides one explicit
// specialization (see FMHA_FWD_KERNEL_BODY in generate.py). Returns the
// measured/launch time reported by the launcher.
template <typename Traits_>
float fmha_fwd_(const stream_config&, fmha_fwd_args);

// This is the public API, will be generated by script
//
// Runtime description of the requested kernel; fmha_fwd() pattern-matches it
// against the compiled instances and dispatches to the matching fmha_fwd_<>.
struct fmha_fwd_traits
{
    int hdim_q;
    int hdim_v;
    std::string data_type; // "fp16" / "bf16" / "fp8" (keys of generate.py DTYPE_MAP)
    bool is_group_mode;
    bool is_v_rowmajor;
    mask_enum mask_type;
    bool has_bias;
    bool has_lse;
    // TODO: padding check is inside this api
};
// Dispatch to a generated kernel instance; returns -1 if no instance matches
// (see FMHA_FWD_API in generate.py).
float fmha_fwd(fmha_fwd_traits, fmha_fwd_args, const stream_config&);
|
||||
500
example/ck_tile/01_fmha/generate.py
Normal file
500
example/ck_tile/01_fmha/generate.py
Normal file
@@ -0,0 +1,500 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
# generate kernel instances to speed up compilation
|
||||
|
||||
import argparse
import copy
import itertools
from dataclasses import dataclass
from pathlib import Path
# BUGFIX: was "from typing import List, Optional, tuple" — the typing module
# exports "Tuple", not "tuple", so the script died with ImportError on startup.
from typing import List, Optional, Tuple
|
||||
|
||||
# CLI dtype tag -> ck_tile element type emitted into generated C++
DTYPE_MAP = {
    "fp16": "ck_tile::half_t",
    "bf16": "ck_tile::bhalf_t",
    "fp8" : "ck_tile::fp8_t"
}

# dtype tag -> element width in bits (used to derive vector sizes in dcheck/dvcheck)
DTYPE_BITS = {
    "fp32": 32,
    "fp16": 16,
    "bf16": 16,
    "fp8" : 8,
    "bf8" : 8
}

# mask tag -> C++ mask alias (see struct FmhaMasks in fmha_fwd.hpp)
MASK_MAP = {
    "no" : "FmhaMasks::NoMask",
    "causal" : "FmhaMasks::CausalMask",
    "generic" : "FmhaMasks::GenericMask"
}

# mode tag -> value of the kIsGroupMode template argument
MODE_MAP = {
    "batch" : "false",
    "group" : "true"
}

# V-layout tag -> value of the is-row-major template argument
LAYOUT_MAP = {
    "row" : "true",
    "col" : "false"
}

# pipeline tag -> C++ pipeline class
PIPELINE_MAP = {
    "qr" : "ck_tile::BlockFmhaPipelineQRKSVS",
    "qr_fp8" : "ck_tile::BlockFmhaPipelineQRKSVSFp8",
    "qr_async" : "ck_tile::BlockFmhaPipelineQRKSVSAsync",
}

# 't'/'f' flag -> C++ boolean literal
BOOL_MAP = {
    "t" : "true",
    "f" : "false"
}

MASKS = ["no", "causal", "generic"]
DIRECTIONS = ["fwd"]
GEN_DIR = "" # in Cmake, have to generate files in same folder
|
||||
|
||||
FMHA_FWD_KERNEL_HEADER = """// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.\n
|
||||
// auto generated by generate.py
|
||||
#include "fmha_fwd.hpp"
|
||||
"""
|
||||
|
||||
FMHA_FWD_KERNEL_BODY="""
|
||||
using fmha_dtype_{F_idx} = {F_dtype};
|
||||
|
||||
using fmha_block_tile_{F_idx} = ck_tile::sequence<{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}>;
|
||||
using fmha_block_warps_{F_idx} = ck_tile::sequence<{F_rm}, {F_rn}, {F_rk}>;
|
||||
using fmha_warp_tile_{F_idx} = ck_tile::sequence<{F_wm}, {F_wn}, {F_wk}>;
|
||||
|
||||
using fmha_shape_{F_idx} = ck_tile::TileFmhaShape<fmha_block_tile_{F_idx},
|
||||
fmha_block_warps_{F_idx},
|
||||
fmha_warp_tile_{F_idx},
|
||||
fmha_block_warps_{F_idx},
|
||||
fmha_warp_tile_{F_idx},
|
||||
{F_vlayout}>;
|
||||
|
||||
using fmha_trait_{F_idx} = ck_tile::TileFmhaTraits<{F_spad},
|
||||
{F_skpad},
|
||||
{F_dpad},
|
||||
{F_dvpad},
|
||||
{F_bias},
|
||||
{F_lse},
|
||||
{F_occupancy}>;
|
||||
using fmha_mask_{F_idx} = {F_mask};
|
||||
|
||||
using fmha_pipeline_problem_{F_idx} = ck_tile::BlockFmhaPipelineProblem<
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::QDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::KDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::VDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SaccDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::SMPLComputeDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::BiasDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::LSEDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::PDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::OaccDataType,
|
||||
typename FmhaFwdTypeConfig<fmha_dtype_{F_idx}>::ODataType,
|
||||
fmha_shape_{F_idx},
|
||||
{F_mode},
|
||||
fmha_mask_{F_idx},
|
||||
fmha_trait_{F_idx}>;
|
||||
|
||||
using fmha_pipeline_{F_idx} = {F_pipeline}<
|
||||
fmha_pipeline_problem_{F_idx}>;
|
||||
|
||||
using fmha_epilogue_{F_idx} =
|
||||
ck_tile::FmhaFwdEpilogue<FmhaFwdEpilogueProblem<typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType,
|
||||
typename FmhaFwdTypeConfig<{F_dtype}>::ODataType,
|
||||
{F_spad}, {F_dvpad}>>;
|
||||
|
||||
using fmha_kernel_{F_idx} =
|
||||
ck_tile::FmhaFwdKernel<FmhaFwdTilePartitioner<fmha_shape_{F_idx}>,
|
||||
fmha_pipeline_{F_idx},
|
||||
fmha_epilogue_{F_idx}>;
|
||||
|
||||
using trait_{F_idx} = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode},{F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
|
||||
|
||||
#include <iostream>
|
||||
|
||||
template<>
|
||||
float fmha_fwd_<trait_{F_idx}>(const stream_config& s, fmha_fwd_args a)
|
||||
{{
|
||||
using k_ = fmha_kernel_{F_idx};
|
||||
if(s.log_level_ > 0)
|
||||
std::cout << ", " << k_::GetName() << std::flush;
|
||||
auto [kargs, grids] = fmha_fwd_create_kargs_and_grids<k_>(a);
|
||||
constexpr dim3 blocks = k_::BlockSize();
|
||||
constexpr ck_tile::index_t kBlockPerCu = k_::kBlockPerCu;
|
||||
return ck_tile::launch_kernel<blocks.x, kBlockPerCu>(s, k_{{}}, grids, blocks, 0, kargs);
|
||||
}}
|
||||
"""
|
||||
|
||||
FMHA_FWD_API_FILENAME="fmha_fwd_api.cpp"
|
||||
FMHA_FWD_API="""
|
||||
float fmha_fwd(fmha_fwd_traits t, fmha_fwd_args a, const stream_config& s){{
|
||||
float r = -1;
|
||||
{F_dispatch}
|
||||
return r;
|
||||
}}
|
||||
"""
|
||||
|
||||
FMHA_FWD_API_PER_DTYPE=""" {F_if}(t.data_type.compare(\"{F_dtype}\") == 0){{
|
||||
{F_hdim_case}
|
||||
}}
|
||||
"""
|
||||
FMHA_FWD_API_PER_HDIM_CASE=""" {F_if} (t.hdim_q <= {F_hdim} && t.hdim_v <= {F_hdim}) {{
|
||||
{F_inner_dispatch}
|
||||
}}
|
||||
"""
|
||||
MASK_CHECK_MAP = {
|
||||
"no" : "t.mask_type == mask_enum::no_mask",
|
||||
"causal" : "t.mask_type == mask_enum::causal_top_left || t.mask_type == mask_enum::causal_bottom_right",
|
||||
"generic" : "t.mask_type == mask_enum::window_generic",
|
||||
}
|
||||
|
||||
FMHA_FWD_API_INNER_DISPATCH=""" {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.has_bias == {F_bias}) && (t.has_lse == {F_lse}) &&
|
||||
({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
|
||||
using trait_ = fmha_fwd_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0blen}, {F_vlayout}, {F_mask}, {F_bias}, {F_lse}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
|
||||
return fmha_fwd_<trait_>(s, a);
|
||||
}}
|
||||
"""
|
||||
|
||||
@dataclass
class FmhaFwdApiTrait:
    """One generated kernel's dispatch signature.

    Mirrors fmha_fwd_traits_<> on the C++ side; the *check properties render
    the runtime guard expressions used in the generated fmha_fwd() dispatcher.
    """
    pipeline_tag : str
    # sync with fmha_fwd_traits<>, to generate fallback calls
    hdim : str
    dtype : str    # data type, key of DTYPE_MAP
    mode : str     # key of MODE_MAP (batch/group)
    bm0 : int      # tile size along q seqlen (block size)
    bn0 : int      # tile size along qk seqlen
    bk0 : int      # tile size along qk gemm unroll
    bn1 : int      # tile size along v head_dim
    bk1 : int      # tile size along kv gemm unroll
    bk0blen : int  # total K0 length (whole-tile Q load)
    vlayout : str  # row/col
    mask : str     # key of MASK_MAP
    bias : str     # 't'/'f'
    lse : str      # 't'/'f'
    spad : str     # 't'/'f' pad seqlen_q
    skpad : str    # 't'/'f' pad seqlen_k
    dpad : str     # 't'/'f' pad hdim_q
    dvpad : str    # 't'/'f' pad hdim_v

    @property
    def name(self) -> str:
        # BUGFIX: the 7th field previously interpolated {self.bn0} a second
        # time, so bn1 was never encoded and distinct tile shapes collided.
        return f'{self.hdim}-{self.dtype}-{self.mode}-{self.bm0}-{self.bn0}-{self.bk0}-{self.bn1}-{self.bk1}-{self.bk0blen}-'+\
            f'{self.vlayout}-{self.mask}-{self.bias}-{self.lse}-{self.spad}-{self.skpad}-{self.dpad}-{self.dvpad}'

    @property
    def scheck(self) -> str:
        # runtime guard on a.seqlen_q matching this kernel's spad setting
        if self.pipeline_tag == 'qr_async':
            return 'true' # async pipeline supports any seqlen_q regardless of spad
        elif self.pipeline_tag in ['qr', 'qr_fp8']:
            if self.spad == 't' : return f'a.seqlen_q % {self.bm0} != 0'
            else                : return f'a.seqlen_q % {self.bm0} == 0'
        else: assert False

    @property
    def skcheck(self) -> str:
        # runtime guard on a.seqlen_k matching this kernel's skpad setting
        if self.skpad == 't' : return f'a.seqlen_k % {self.bn0} != 0'
        else                 : return f'a.seqlen_k % {self.bn0} == 0'

    @property
    def dcheck(self) -> str:
        # runtime guard on a.hdim_q matching this kernel's dpad setting
        if self.pipeline_tag == 'qr_async':
            vec = int((32 * 4) / DTYPE_BITS[self.dtype]) # elements per 128-bit load
            if self.dpad == 't': return f'a.hdim_q % {vec} == 0'
            else : assert False # qr_async kernels are only generated with dpad == 't'
        elif self.pipeline_tag in ['qr', 'qr_fp8']:
            if self.dpad == 't': return f'a.hdim_q % {self.bk0blen} != 0'
            else               : return f'a.hdim_q % {self.bk0blen} == 0'
        else: assert False

    @property
    def dvcheck(self) -> str:
        # runtime guard on a.hdim_v matching this kernel's dvpad setting
        if self.pipeline_tag == 'qr_async':
            vec = int((32 * 4) / DTYPE_BITS[self.dtype]) # elements per 128-bit load
            if self.dvpad == 't': return f'a.hdim_v % {vec} == 0'
            else : assert False # qr_async kernels are only generated with dvpad == 't'
        elif self.pipeline_tag in ['qr', 'qr_fp8']:
            if self.dvpad == 't': return f'a.hdim_v % {self.bk0blen} != 0'
            else                : return f'a.hdim_v % {self.bk0blen} == 0'
        else: assert False
|
||||
|
||||
@dataclass
class FmhaFwdPipeline:
    """Pipeline selection plus the boolean knobs baked into one kernel blob."""
    tag : str             # key of PIPELINE_MAP

    F_vlayout : str       # row/col
    F_spad : str          # 't'/'f': pad seqlen_q
    F_skpad : str         # 't'/'f': pad seqlen_k
    F_dpad : str          # 't'/'f': pad hdim_q
    F_dvpad : str         # 't'/'f': pad hdim_v
    F_bias : str          # 't'/'f'
    F_lse : str           # 't'/'f'
    F_mask : str          # key of MASK_MAP

    @property
    def name(self) -> str:
        # Suffixes: optional 'p<pads>', then '_bias', '_m<mask initial>', '_lse'.
        pads = ''.join(tag for flag, tag in ((self.F_spad, 's'),
                                             (self.F_skpad, 'sk'),
                                             (self.F_dpad, 'd'),
                                             (self.F_dvpad, 'dv')) if flag == 't')
        parts = [f'{self.tag}_v{self.F_vlayout[0]}']
        if pads:
            parts.append('p' + pads)
        if self.F_bias == 't':
            parts.append('bias')
        if self.F_mask != 'no':
            parts.append(f'm{self.F_mask[0]}')
        if self.F_lse == 't':
            parts.append('lse')
        return '_'.join(parts)
|
||||
|
||||
class FmhaFwdApiPool:
    # Collects every generated kernel's dispatch trait, grouped by
    # dtype -> hdim, and renders the runtime fmha_fwd() dispatch function.
    def __init__(self):
        # pool[dtype][hdim] -> list of FmhaFwdApiTrait
        self.pool = dict()

    def register_traits(self, trait : FmhaFwdApiTrait) -> None:
        # TODO: do we need to check duplication?
        if trait.dtype not in self.pool.keys():
            self.pool[trait.dtype] = dict()
        if trait.hdim not in self.pool[trait.dtype].keys():
            self.pool[trait.dtype][trait.hdim] = list()

        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))

    @property
    def api(self) -> str:
        # Render fmha_fwd_api.cpp: nested if/else-if chains over
        # dtype -> hdim -> per-kernel runtime guard, each leaf calling the
        # matching fmha_fwd_<trait_> specialization.
        per_dtypes=str()
        for i, dtype in enumerate(self.pool.keys()):
            per_hdim_case=str()
            for j, hdim in enumerate(self.pool[dtype].keys()):
                traits=self.pool[dtype][hdim]
                inners=str()
                for k, trait in enumerate(traits):
                    # first branch is 'if', the rest 'else if'
                    if_k = 'if' if k == 0 else 'else if'
                    inners = inners + FMHA_FWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], F_mask=MASK_MAP[trait.mask],
                                   F_mask_check=MASK_CHECK_MAP[trait.mask], F_bias=BOOL_MAP[trait.bias], F_lse=BOOL_MAP[trait.lse],
                                   F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck,
                                   F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad],
                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0blen=trait.bk0blen,
                                   F_hdim=hdim, F_dtype=DTYPE_MAP[dtype])
                if_j = 'if' if j == 0 else 'else if'
                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners)
            if_i = 'if' if i == 0 else 'else if'
            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
        return FMHA_FWD_KERNEL_HEADER + FMHA_FWD_API.format(F_dispatch = per_dtypes)
|
||||
|
||||
@dataclass
class FmhaFwdTileSize:
    """Block/warp tiling configuration for one generated kernel."""
    F_bm0 : int # tile size along q seqlen (block size)
    F_bn0 : int # tile size along qk seqlen
    F_bk0 : int # tile size along qk gemm unroll
    F_bn1 : int # tile size along v head_dim
    F_bk1 : int # tile size along kv gemm unroll
    F_bk0blen : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile)
    F_rm : int # number of warps along q seqlen (block warps)
    F_rn : int # number of warps along k seqlen(not used)
    F_rk : int # number of warps along gemm-k(not used)
    F_wm : int # warp size along m (warp size)
    F_wn : int # warp size along n
    F_wk : int # warp size along k
    F_occupancy : int # occupancy, -1 will let pipeline decide the occupancy, other value will overwrite occupancy

    @property
    def name(self) -> str:
        # "b<block tile>_r<warp counts>_w<warp tile>[_o<occupancy>]"
        label = (f"b{self.F_bm0}x{self.F_bn0}x{self.F_bk0}x{self.F_bn1}x{self.F_bk1}x{self.F_bk0blen}"
                 f"_r{self.F_rm}x{self.F_rn}x{self.F_rk}_w{self.F_wm}x{self.F_wn}x{self.F_wk}")
        if self.F_occupancy != -1:
            label += f"_o{self.F_occupancy}"
        return label
|
||||
|
||||
@dataclass
class FmhaFwdKernel:
    # One kernel-instance blob: owns everything needed to render its .cpp
    # file (template) and to register itself for dispatch (api_trait).
    direction : str
    F_idx : int # this is not a tunable, but a counter to differentiate symbol
    F_hdim : int # hdim
    F_dtype : str # data type
    F_mode : str # value from MODE_MAP
    F_tile : FmhaFwdTileSize
    F_pipeline : FmhaFwdPipeline

    @property
    def template(self) -> str:
        # Render the full generated .cpp source for this instance.
        return FMHA_FWD_KERNEL_HEADER + \
            FMHA_FWD_KERNEL_BODY.format(
                F_idx = self.F_idx,
                F_hdim = self.F_hdim,
                F_dtype = DTYPE_MAP[self.F_dtype],
                F_bm0 = self.F_tile.F_bm0,
                F_bn0 = self.F_tile.F_bn0,
                F_bk0 = self.F_tile.F_bk0,
                F_bn1 = self.F_tile.F_bn1,
                F_bk1 = self.F_tile.F_bk1,
                F_bk0blen = self.F_tile.F_bk0blen,
                F_rm = self.F_tile.F_rm,
                F_rn = self.F_tile.F_rn,
                F_rk = self.F_tile.F_rk,
                F_wm = self.F_tile.F_wm,
                F_wn = self.F_tile.F_wn,
                F_wk = self.F_tile.F_wk,
                F_vlayout = LAYOUT_MAP[self.F_pipeline.F_vlayout],
                F_spad = BOOL_MAP[self.F_pipeline.F_spad],
                F_skpad = BOOL_MAP[self.F_pipeline.F_skpad],
                F_dpad = BOOL_MAP[self.F_pipeline.F_dpad],
                F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad],
                F_bias = BOOL_MAP[self.F_pipeline.F_bias],
                F_lse = BOOL_MAP[self.F_pipeline.F_lse],
                F_occupancy = self.F_tile.F_occupancy ,
                F_mask = MASK_MAP[self.F_pipeline.F_mask],
                F_mode = MODE_MAP[self.F_mode],
                F_pipeline = PIPELINE_MAP[self.F_pipeline.tag])

    @property
    def name(self) -> str:
        # TODO: we don't encode idx here
        return f"fmha_{self.direction}_d{self.F_hdim}_{self.F_dtype}_{self.F_mode}_" +\
            self.F_tile.name + '_' + self.F_pipeline.name

    @property
    def filename(self) -> str:
        # Generated source file name for this instance.
        return self.name + ".cpp"

    def api_trait(self) -> FmhaFwdApiTrait:
        # Flatten this kernel's configuration into the dispatch-table entry.
        return FmhaFwdApiTrait(
            pipeline_tag=self.F_pipeline.tag,
            hdim=str(self.F_hdim),
            dtype=self.F_dtype,
            mode=self.F_mode,
            bm0=self.F_tile.F_bm0,
            bn0=self.F_tile.F_bn0,
            bk0=self.F_tile.F_bk0,
            bn1=self.F_tile.F_bn1,
            bk1=self.F_tile.F_bk1,
            bk0blen=self.F_tile.F_bk0blen,
            vlayout=self.F_pipeline.F_vlayout,
            mask=self.F_pipeline.F_mask,
            bias=self.F_pipeline.F_bias,
            lse=self.F_pipeline.F_lse,
            spad=self.F_pipeline.F_spad,
            skpad=self.F_pipeline.F_skpad,
            dpad=self.F_pipeline.F_dpad,
            dvpad=self.F_pipeline.F_dvpad)
|
||||
|
||||
# TODO: design a more practical way to do it
|
||||
# this is current supported tile size per hdim
|
||||
def get_fmha_fwd_tile_dict_from_dtype(direction : str, dtype : str) -> Optional[dict]:
    """Return the supported tile configs for (direction, dtype), keyed by the
    head-dim as a string, or None when the combination is unsupported."""
    if direction != 'fwd':
        return None
    if dtype in ('fp16', 'bf16'):
        return {
            '32'  : FmhaFwdTileSize(128, 64, 16, 32, 32, 32, 2, 1, 1, 32, 32, 16, -1),
            '64'  : FmhaFwdTileSize(128, 64, 32, 64, 32, 64, 4, 1, 1, 32, 32, 16, -1),
            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 16, -1),
            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32, 256, 4, 1, 1, 32, 32, 16, -1),
        }
    if dtype in ('fp8', 'bf8'):
        return {
            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32, 128, 4, 1, 1, 32, 32, 32, -1)
        }
    return None
|
||||
|
||||
def get_blobs() -> tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
    """Enumerate every kernel instance to generate.

    Returns (api_pool, kernels): api_pool renders fmha_fwd_api.cpp, kernels
    are the per-instance .cpp blobs.
    """
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    # support this in future
    def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]:
        # this function will populate a list of possible pipelines
        pipelines = []
        if dtype in ['fp16', 'bf16']:
            for mask, bias, lse in itertools.product(MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
                #if hdim == 256:
                if True:
                    # unpadded + fully padded variants of the qr pipeline,
                    # for both V layouts
                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, mask))
                    pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, mask))

                    pipelines.append(FmhaFwdPipeline('qr', 'row', 't', 't', 't', 't', bias, lse, mask))
                    pipelines.append(FmhaFwdPipeline('qr', 'col', 't', 't', 't', 't', bias, lse, mask))
                #else:
                #    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 'f', 't', 't', bias, lse, mask))
                #    pipelines.append(FmhaFwdPipeline('qr_async', 'row', 't', 't', 't', 't', bias, lse, mask))
                #    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 'f', 't', 't', bias, lse, mask))
                #    pipelines.append(FmhaFwdPipeline('qr_async', 'col', 't', 't', 't', 't', bias, lse, mask))
        elif dtype in ['fp8', 'bf8']:
            # no need lse kernels
            for mask, bias in itertools.product(MASK_MAP.keys(), ["t", "f"]):
                pipelines.append(FmhaFwdPipeline('qr_fp8', 'col', 'f', 'f', 'f', 'f', bias, 'f', mask))
        else:
            # BUGFIX: was "assert Fasle" — a typo that raised NameError
            # instead of AssertionError.
            assert False, f'unsupported dtype: {dtype}'
        return pipelines

    gen = list()
    api_pool = FmhaFwdApiPool()

    for direction, dtype in itertools.product(DIRECTIONS, DTYPE_MAP.keys()):
        d = get_fmha_fwd_tile_dict_from_dtype(direction, dtype)
        if d is None:  # idiom fix: identity check, not "== None"
            continue
        for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()):
            tile = d[hdim_str]
            hdim = int(hdim_str)
            for pipeline in get_pipelines(dtype, hdim):
                k = FmhaFwdKernel(direction=direction, F_idx=0, F_hdim=hdim, F_dtype=dtype,
                                  F_mode=mode, F_tile=tile, F_pipeline=pipeline)
                api_pool.register_traits(k.api_trait())
                gen.append(k)

    return (api_pool, gen)
|
||||
|
||||
def write_single_kernel(kernel: FmhaFwdKernel, autogen_dir: Path) -> None:
    # Emit one kernel instance's generated .cpp into the autogen directory.
    target = autogen_dir / kernel.filename
    target.write_text(kernel.template)
|
||||
|
||||
def write_api(api_pool : FmhaFwdApiPool, autogen_dir: Path) -> None:
    # Emit the generated fmha_fwd() dispatcher source file.
    target = autogen_dir / FMHA_FWD_API_FILENAME
    target.write_text(api_pool.api)
|
||||
|
||||
def write_blobs(output_dir: Optional[str]) -> None:
    """Generate every kernel .cpp plus the API dispatcher into output_dir
    (defaults to this script's own folder when None)."""
    target = Path(__file__).parent if output_dir is None else Path(output_dir) / GEN_DIR
    target.mkdir(parents=True, exist_ok=True)
    api_pool, kernels = get_blobs()
    for kernel in kernels:
        write_single_kernel(kernel, target)
    write_api(api_pool, target)
|
||||
|
||||
# list all the files that will be generated
def list_blobs(output_file: Optional[str]) -> None:
    """Write the paths of every to-be-generated file into output_file so the
    build system can declare them as dependencies (cmake --list_blobs step)."""
    assert output_file is not None
    file_path = Path(output_file)
    # BUGFIX: open with 'w' instead of 'a' — append mode made every cmake
    # re-configure accumulate duplicate entries in blob_list.txt.
    with file_path.open('w') as f:
        _, kernels = get_blobs()
        for kernel in kernels:
            f.write(str(file_path.parent / GEN_DIR / kernel.filename) + "\n")
        f.write(str(file_path.parent / GEN_DIR / FMHA_FWD_API_FILENAME) + "\n")
|
||||
|
||||
if __name__ == "__main__":
    # CLI: either list the files that would be generated (-l) or actually
    # write them (-o / default directory).
    parser = argparse.ArgumentParser(
        prog="generate",
        description="gen api for CK fmha kernel",
    )
    parser.add_argument("-o", "--output_dir", required=False,
                        help="write all the blobs into a directory")
    parser.add_argument("-l", "--list_blobs", required=False,
                        help="list all the kernels to a file")
    args = parser.parse_args()
    if args.list_blobs is None:
        write_blobs(args.output_dir)
    else:
        list_blobs(args.list_blobs)
|
||||
108
example/ck_tile/01_fmha/mask.hpp
Normal file
108
example/ck_tile/01_fmha/mask.hpp
Normal file
@@ -0,0 +1,108 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <ostream>
#include <string>

#include "ck_tile/core.hpp"
#include "ck_tile/fmha.hpp"
|
||||
|
||||
// Attention-mask flavors selectable from the command line; the string
// encodings are parsed by mask_info::decode() below.
enum class mask_enum
{
    no_mask = 0,         // dense attention, no masking
    causal_top_left,     // causal mask anchored at the top-left corner
    causal_bottom_right, // causal mask anchored at the bottom-right corner
    window_generic,      // generic window mask given directly as y/x sizes
};
|
||||
|
||||
struct mask_info
|
||||
{
|
||||
mask_enum type;
|
||||
ck_tile::index_t y, x;
|
||||
|
||||
void serialize(std::ostream& os) const
|
||||
{
|
||||
if(type == mask_enum::no_mask)
|
||||
os << "n";
|
||||
else if(type == mask_enum::causal_top_left)
|
||||
os << "tl";
|
||||
else if(type == mask_enum::causal_bottom_right)
|
||||
os << "br";
|
||||
else
|
||||
{
|
||||
os << "g(" << y << "/" << x << ")";
|
||||
}
|
||||
}
|
||||
static mask_info decode(std::string str, ck_tile::index_t seqlen_q, ck_tile::index_t seqlen_k)
|
||||
{
|
||||
ck_tile::index_t x_total = seqlen_k;
|
||||
ck_tile::index_t y_total = seqlen_q;
|
||||
mask_info tmp;
|
||||
auto found_0 = str.find(':');
|
||||
if(found_0 != std::string::npos)
|
||||
{
|
||||
std::string t = str.substr(0, found_0);
|
||||
std::string v = str.substr(found_0 + 1);
|
||||
auto found_1 = v.find(",");
|
||||
if(found_1 == std::string::npos)
|
||||
{
|
||||
printf("not supported value %s, %s\n", v.c_str(), str.c_str());
|
||||
assert(0);
|
||||
}
|
||||
tmp.type = mask_enum::window_generic;
|
||||
ck_tile::index_t v0 = atoi(v.substr(0, found_1).c_str());
|
||||
ck_tile::index_t v1 = atoi(v.substr(found_1 + 1).c_str());
|
||||
// TODO: some validation
|
||||
if(t == "t")
|
||||
{
|
||||
auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
|
||||
v0, v1, y_total, x_total, true);
|
||||
tmp.y = r.at(ck_tile::number<0>{});
|
||||
tmp.x = r.at(ck_tile::number<1>{});
|
||||
}
|
||||
else if(t == "b")
|
||||
{
|
||||
auto r = ck_tile::make_generic_attention_mask_coordinates_from_lr_window(
|
||||
v0, v1, y_total, x_total, false);
|
||||
tmp.y = r.at(ck_tile::number<0>{});
|
||||
tmp.x = r.at(ck_tile::number<1>{});
|
||||
}
|
||||
else if(t == "g")
|
||||
{
|
||||
tmp.y = v0;
|
||||
tmp.x = v1;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("not supported type %s, %s\n", t.c_str(), str.c_str());
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// should be 0, 1, 2
|
||||
tmp.type = static_cast<mask_enum>(atoi(str.c_str()));
|
||||
if(tmp.type == mask_enum::causal_top_left)
|
||||
{
|
||||
tmp.y = seqlen_q;
|
||||
tmp.x = 1;
|
||||
}
|
||||
else if(tmp.type == mask_enum::causal_bottom_right)
|
||||
{
|
||||
tmp.y = seqlen_q;
|
||||
tmp.x = seqlen_k - seqlen_q + 1;
|
||||
}
|
||||
}
|
||||
return tmp;
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, const mask_info& mi);
|
||||
};
|
||||
|
||||
// Stream insertion for mask_info; delegates to mask_info::serialize().
inline std::ostream& operator<<(std::ostream& os, const mask_info& mi)
{
    mi.serialize(os);
    return os;
}
|
||||
BIN
example/ck_tile/01_fmha/misc/gamc.png
Normal file
BIN
example/ck_tile/01_fmha/misc/gamc.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 29 KiB |
21
example/ck_tile/01_fmha/script/benchmark.sh
Normal file
21
example/ck_tile/01_fmha/script/benchmark.sh
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/bin/sh
# Benchmark sweep for the fmha fwd example over precision, permutation and
# head dimension; batch size scales down as sequence length grows so total
# work stays roughly constant.
# TODO: run this script from CK root
BUILD=build
EXE=$BUILD/bin/example_fmha_fwd
VALID=0

for prec in "fp16" "bf16" ; do
for perm in 0 1 ; do
for hdim in 64 128 256 ; do

nhead=$((2048 / $hdim)) # follow fav2 setup
# each entry is "<batch> <seqlen>"; sleep between runs to let the GPU cool
for cfg in "32 512" "16 1024" "8 2048" "4 4096" "2 8192" "1 16384" ; do
    set -- $cfg
    $EXE -prec=$prec -b=$1 -h=$nhead -d=$hdim -s=$2 -iperm=$perm -operm=$perm -v=$VALID ; sleep 3
done

done
done
done
|
||||
34
example/ck_tile/01_fmha/script/smoke_test.sh
Normal file
34
example/ck_tile/01_fmha/script/smoke_test.sh
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/bin/sh
# Smoke test for the fmha fwd example: sweeps precision, permutation,
# V-layout, head dim, lse and bias, with validation enabled (-v=1).
# TODO: run this script from CK root
BUILD=build
EXE=$BUILD/bin/example_fmha_fwd
KNAME=1

export CK_WARMUP=0
export CK_REPEAT=1

COMMON_ARGS='-v=1 -warmup=0 -repeat=1'
mode=0

for prec in "fp16" "bf16" ; do
# for mode in 1 0 ; do
for perm in 0 1 ; do
for vlayout in "r" "c" ; do
for hdim in 32 64 128 256 ; do
for lse in 0 1 ; do
for bias in 0 1 ; do

$EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS
# fixed stray comma: "-d=16," -> "-d=16"
$EXE -prec=$prec -mode=$mode -b=2 -h=2 -h_k=1 -d=16 -d_v=$hdim -s=55 -s_k=256 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -mode=$mode -b=1 -h=3 -d=$hdim -s=100 -s_k=51 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=16 -d_v=$hdim -s=99 -s_k=256 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -mask=1 -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=1024 -s_k=256 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -mask=2 -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS
$EXE -prec=$prec -mode=$mode -b=1 -h=1 -d=$hdim -s=256 -s_k=512 -bias=$bias -lse=$lse -iperm=$perm -operm=$perm -mask=g:128,32 -vlayout=$vlayout -kname=$KNAME $COMMON_ARGS

done
done
done
done
done
done
#done
|
||||
91
example/ck_tile/01_fmha/utils.hpp
Normal file
91
example/ck_tile/01_fmha/utils.hpp
Normal file
@@ -0,0 +1,91 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <optional>
#include <ostream>
#include <random>
#include <tuple>
#include <utility>
#include <vector>

#include "ck_tile/core/container/span.hpp"
|
||||
|
||||
// Execution mode of the example: in batch mode every batch entry uses the
// same sequence length, while group (varlen) mode allows per-group lengths
// (see generate_seqlens below, which only randomizes lengths in group mode).
enum class mode_enum
{
    batch = 0,
    group
};
|
||||
|
||||
// Stream a mode_enum as its human-readable name ("batch" / "group").
// Marked inline: this is defined in a header, and without inline every
// including translation unit would emit a duplicate definition (ODR
// violation / multiple-definition link error).
inline std::ostream& operator<<(std::ostream& stream, mode_enum mode)
{
    return stream << (mode == mode_enum::batch ? "batch" : "group");
}
|
||||
|
||||
std::vector<int32_t> to_seqstarts(ck_tile::span<const int32_t> seqlens)
|
||||
{
|
||||
std::vector<int32_t> seqstarts = {0};
|
||||
for(int32_t seqlen : seqlens)
|
||||
{
|
||||
seqstarts.push_back(seqstarts.back() + seqlen);
|
||||
}
|
||||
assert(seqstarts.size() == seqlens.size() + 1);
|
||||
return seqstarts;
|
||||
}
|
||||
|
||||
std::vector<int32_t> generate_seqlens(mode_enum mode,
|
||||
unsigned count,
|
||||
int32_t seqlens_sum,
|
||||
std::optional<unsigned> seed = std::nullopt)
|
||||
{
|
||||
assert(0 < count);
|
||||
|
||||
std::vector<int32_t> seqlens(count, seqlens_sum);
|
||||
|
||||
if(mode == mode_enum::group && 1 < count)
|
||||
{
|
||||
using size_type = std::vector<int32_t>::size_type;
|
||||
|
||||
std::mt19937 random_engine(seed.has_value() ? *seed : std::random_device{}());
|
||||
std::uniform_int_distribution<size_type> idx_dist(0, count - 1);
|
||||
auto next_idx = std::bind(idx_dist, std::ref(random_engine));
|
||||
|
||||
std::uniform_int_distribution<size_type> step_dist(1, count - 1);
|
||||
auto next_step = std::bind(step_dist, std::ref(random_engine));
|
||||
|
||||
for(unsigned repeat = seqlens_sum * (count / 2); 0 < repeat; --repeat)
|
||||
{
|
||||
const size_type to_decrease = next_idx();
|
||||
// make sure each elements of seqlens is always greater than 0
|
||||
if(seqlens[to_decrease] == 1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_type to_increase = (to_decrease + next_step()) % count;
|
||||
|
||||
--seqlens[to_decrease];
|
||||
++seqlens[to_increase];
|
||||
}
|
||||
}
|
||||
|
||||
return seqlens;
|
||||
}
|
||||
|
||||
std::vector<int32_t> generate_seqstarts(mode_enum mode,
|
||||
unsigned count,
|
||||
int32_t seqlens_sum,
|
||||
std::optional<unsigned> seed = std::nullopt)
|
||||
{
|
||||
return to_seqstarts(generate_seqlens(mode, count, seqlens_sum, seed));
|
||||
}
|
||||
|
||||
// Read integer environment variable `var_name`; return `default_int` when it
// is unset. Non-numeric values yield atoi's result (0 for pure garbage).
// Marked inline: header-defined, avoids ODR violations across TUs.
inline int env_get_int(const char* var_name, int default_int)
{
    const char* v = getenv(var_name);
    int r = default_int;
    if(v)
        r = atoi(v);
    return r;
}
|
||||
Reference in New Issue
Block a user