[CK_TILE] Migrate CK Tile examples to Tests to autorun on CI (#2421)

[CK_TILE] Add new ck tile unit test * Add new ck tile unit test smoke-gemm-universal * Add new ck tile unit test smoke-gemm-basic * Add new ck tile unit test topk_softmax * Add new ck tile unit test add_rmsnorm2d_rdquant_fwd
2026-05-04 05:31:24 +00:00 · 2025-07-22 08:15:18 -06:00
parent c9886109b4
commit f102eedfb3
61 changed files with 6298 additions and 0 deletions
--- a/test/ck_tile/layernorm2d/CMakeLists.txt
+++ b/test/ck_tile/layernorm2d/CMakeLists.txt
@@ -0,0 +1,53 @@
+function(create_tile_layernorm2d_fwd SUFFIX)
+    set(TEST_CK_TILE_LAYERNORM2D_FWD "test_ck_tile_layernorm2d_fwd_${SUFFIX}")
+
+    message(DEBUG "adding example ${TEST_CK_TILE_LAYERNORM2D_FWD}")
+    add_test_executable(${TEST_CK_TILE_LAYERNORM2D_FWD} layernorm2d_fwd_${SUFFIX}.cpp)
+    target_include_directories(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+    target_sources(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})
+
+    set(TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS)
+
+    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+    list(APPEND TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal --offload-compress)
+
+    target_compile_options(${TEST_CK_TILE_LAYERNORM2D_FWD} PRIVATE ${TEST_CK_TILE_LAYERNORM2D_FWD_COMPILE_OPTIONS})
+endfunction()
+
+if(GPU_TARGETS MATCHES "gfx9")
+    set(LAYERNORM2D_FWD_KNOWN_APIS "fwd;bwd")
+    set(LAYERNORM2D_FWD_ENABLE_APIS  "fwd" CACHE STRING
+        "semicolon-separated list of APIs to generate (${LAYERNORM2D_FWD_KNOWN_APIS}) & link, or \"all\".")
+    if(LAYERNORM2D_FWD_ENABLE_APIS  STREQUAL "all")
+        set(LAYERNORM2D_FWD_ENABLE_APIS  ${LAYERNORM2D_FWD_KNOWN_APIS})
+    endif()
+
+    # generate a list of kernels, but not actually emit files at config sta
+    execute_process(
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+        --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --list_blobs
+        RESULT_VARIABLE ret
+        )
+    if(ret AND NOT ret EQUAL 0)
+        message( FATAL_ERROR "Fail to generate kernels via Python. ${ret}")
+    endif()
+
+    file(STRINGS ${CMAKE_CURRENT_BINARY_DIR}/layernorm2d_fwd_blobs.txt LAYERNORM2D_FWD_GEN_BLOBS)
+
+    add_custom_command(
+        OUTPUT ${LAYERNORM2D_FWD_GEN_BLOBS}
+        COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/generate.py
+        --api ${LAYERNORM2D_FWD_ENABLE_APIS} --working_path ${CMAKE_CURRENT_BINARY_DIR} --gen_blobs
+        )
+
+    create_tile_layernorm2d_fwd("fp16")
+    create_tile_layernorm2d_fwd("bf16")
+
+# TODO: we have to turn off this global prop, otherwise the progress bar generated
+# by cmake will print too many files, execvp: /bin/sh: Argument list too long
+# however, this property may affect global
+# TODO: consider codegen a makefile by us
+    set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
+else()
+    message(DEBUG "Skipping ck tile add_rmsnorm2d_rdquant_fwd tests for current target")
+endif()
--- a/test/ck_tile/layernorm2d/generate.py
+++ b/test/ck_tile/layernorm2d/generate.py
@@ -0,0 +1,730 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+# generate kernel instances to speed up compilation
+
+import argparse
+from enum import IntEnum
+from pathlib import Path
+import sys
+from typing import List, Optional, Any
+import functools
+import itertools
+import copy
+from dataclasses import dataclass
+
+def get_if_str(idx, total, lase_else = True):
+    if idx == 0:
+        return 'if'
+    elif idx < total - 1:
+        return 'else if'
+    else:
+        if lase_else:
+            return 'else'
+        else:
+            return 'else if'
+
+XBIAS_ENUM_STR_MAP = [
+    'no',
+    'xbias']      # pre-norm add bias
+
+FUSED_ADD_ENUM_STR_MAP = [
+    'no',
+    'pras',      # pre-norm
+    'pra' ]      # post-norm
+
+FUSED_FUSED_SWEEP_STR_MAP = [
+    'no',
+    'dquant' ]
+
+DATA_TYPE_MAP = {'fp32' : 'float',
+                 'fp16' : 'ck_tile::fp16_t',
+                 'bf16' : 'ck_tile::bf16_t',
+                 'int8' : 'ck_tile::int8_t',
+                 'fp8'  : 'ck_tile::fp8_t'}
+
+def BOOL_MAP(b_) -> str:
+    if b_:
+        return 'true'
+    else:
+        return 'false'
+
+class layernorm_fwd_codegen:
+    API_TRAITS_DEFINE = """
+// this is used to pattern-match internl kernel implementation, not to instantiate kernel
+template <typename XDataType_,
+          typename YDataType_,
+          typename SmoothScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
+          bool kWelford_,
+          bool kTwoPass_,
+          ck_tile::index_t kXbias_ = 0,
+          ck_tile::index_t kFusedAdd_ = 0,
+          ck_tile::index_t kFusedQuant_ = 0>
+struct layernorm2d_fwd_traits_
+{
+    using XDataType = ck_tile::remove_cvref_t<XDataType_>;
+    using YDataType = ck_tile::remove_cvref_t<YDataType_>;
+    using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
+    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;
+
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
+    static constexpr ck_tile::index_t total_warps =
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();
+
+    // num of warps along m
+    static constexpr ck_tile::index_t BlockWarps_M = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
+        }
+        else
+        {
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
+        }
+    }();
+
+    // num of warps along n
+    static constexpr ck_tile::index_t BlockWarps_N = []() {
+        if constexpr(is_warp_per_row)
+        {
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return 1;
+        }
+        else
+        {
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
+        }
+    }();
+
+    static constexpr ck_tile::index_t Repeat_M = Repeat_M_;
+    static constexpr ck_tile::index_t Repeat_N = Repeat_N_;
+
+    static constexpr ck_tile::index_t Block_M = Repeat_M_ * ThreadPerBlock_M_;
+    static constexpr ck_tile::index_t Block_N = Repeat_N_ * ThreadPerBlock_N_ * Vector_N_;
+
+    static constexpr ck_tile::index_t Warp_M = ThreadPerBlock_M_ / BlockWarps_M;
+    static constexpr ck_tile::index_t Warp_N = ThreadPerBlock_N_ / BlockWarps_N * Vector_N_;
+
+    using BlockTile  = ck_tile::sequence<Block_M, Block_N>;
+    using BlockWarps = ck_tile::sequence<BlockWarps_M, BlockWarps_N>;
+    using WarpTile   = ck_tile::sequence<Warp_M, Warp_N>;
+    using Vector     = ck_tile::sequence<1, Vector_N_>;
+
+    using Shape = ck_tile::Generic2dBlockShape<BlockTile, BlockWarps, WarpTile, Vector>;
+
+    static constexpr bool kPadN           = kPadN_;
+    static constexpr bool kSaveMeanInvStd = kSaveMeanInvStd_;
+    static constexpr bool kFastFDiv       = kFastFDiv_;
+    static constexpr bool kWelford        = kWelford_;
+    static constexpr bool kTwoPass        = kTwoPass_;
+    static constexpr ck_tile::index_t kXbias = kXbias_;
+    static constexpr ck_tile::index_t kFusedAdd = kFusedAdd_;
+    static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
+};
+
+template <typename XDataType_,
+          typename YDataType_,
+          typename SmoothScaleDataType_,
+          typename YScaleDataType_,
+          ck_tile::index_t Repeat_M_,         // each thread repeat along M
+          ck_tile::index_t Repeat_N_,         // each thread repeat along N
+          ck_tile::index_t ThreadPerBlock_M_, // num threads along M
+          ck_tile::index_t ThreadPerBlock_N_, // num threads along N
+          ck_tile::index_t Vector_N_,         // vector size along N
+          bool kPadN_,
+          bool kSaveMeanInvStd_,
+          bool kFastFDiv_,
+          bool kWelford_,
+          bool kTwoPass_,
+          int  kXbias_,
+          int  kFusedAdd_,
+          int  kFusedQuant_>
+using traits_ = layernorm2d_fwd_traits_<XDataType_,
+                                       YDataType_,
+                                       SmoothScaleDataType_,
+                                       YScaleDataType_,
+                                       Repeat_M_,
+                                       Repeat_N_,
+                                       ThreadPerBlock_M_,
+                                       ThreadPerBlock_N_,
+                                       Vector_N_,
+                                       kPadN_,
+                                       kSaveMeanInvStd_,
+                                       kFastFDiv_,
+                                       kWelford_,
+                                       kTwoPass_,
+                                       kXbias_,
+                                       kFusedAdd_,
+                                       kFusedQuant_>;
+"""
+    API_COMMON_HEADER = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+#include <ck_tile/ops/epilogue.hpp>
+#include <iostream>
+
+#pragma once
+
+using S = ck_tile::stream_config;
+using A = layernorm2d_fwd_args;
+
+{F_traits_define}
+
+template <typename Traits_>
+float layernorm2d_fwd_(const S& s, A a)
+{{
+    using XDataType = typename Traits_::XDataType;
+    using YDataType = typename Traits_::YDataType;
+    using SmoothScaleDataType = typename Traits_::SmoothScaleDataType;
+    using YScaleDataType = typename Traits_::YScaleDataType;
+    using ComputeDataType = typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType;
+
+    using PipelineTraits = ck_tile::Layernorm2dFwdTraits<Traits_::kPadN,
+        Traits_::kSaveMeanInvStd,
+        Traits_::kFastFDiv,
+        Traits_::kWelford,
+        Traits_::kTwoPass,
+        static_cast<ck_tile::Layernorm2dXBiasEnum>(Traits_::kXbias),
+        static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
+        static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
+    using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::XBiasDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::GammaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::BetaDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::ComputeDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::MeanDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::InvStdDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::SmoothScaleDataType,
+        typename LayerNormTypeConfig<XDataType, YDataType, SmoothScaleDataType, YScaleDataType>::YScaleDataType,
+        typename Traits_::Shape,
+        PipelineTraits>;
+
+    using OnePassPipeline = ck_tile::Layernorm2dFwdPipelineOnePass<PipelineProblem>;
+    using TwoPassPipeline = ck_tile::Layernorm2dFwdPipelineTwoPass<PipelineProblem>;
+    using Pipeline        = std::conditional_t<Traits_::kTwoPass, TwoPassPipeline, OnePassPipeline>;
+
+    using Default2DEpilogueProblem = ck_tile::Default2DEpilogueProblem<ComputeDataType, YDataType, false, Traits_::kPadN, true>;
+    using Default2DEpilogue = ck_tile::Default2DEpilogue<Default2DEpilogueProblem>;
+
+    static constexpr bool UseSmoothInputScale = Traits_::kFusedQuant == 1;
+    static constexpr bool UseRawStore = sizeof(YDataType) == 4;
+    using DynamicQuantEpilogueProblem = ck_tile::DynamicQuantEpilogueProblem<ComputeDataType, SmoothScaleDataType, YScaleDataType, YDataType, typename Traits_::Shape,
+            ck_tile::DynamicQuantEpilogueTraits<false, Traits_::kPadN, UseSmoothInputScale, UseRawStore,  true/*max3*/>>;
+
+    using DynamicQuantEpilogue = ck_tile::DynamicQuantEpilogue<DynamicQuantEpilogueProblem>;
+
+    using Epilogue = std::conditional_t<Traits_::kFusedQuant == 1, DynamicQuantEpilogue,  Default2DEpilogue>;
+
+    using Kernel = ck_tile::Layernorm2dFwd<Pipeline, Epilogue>;
+
+    const dim3 grids                       = Kernel::GridSize(a);
+    constexpr dim3 blocks                  = Kernel::BlockSize();
+    constexpr ck_tile::index_t kBlockPerCu = 1;
+
+    auto kargs = Kernel::MakeKargs(a);
+    if(s.log_level_ > 0)
+        std::cout << ", " << Kernel::GetName() << std::flush;
+
+    return ck_tile::launch_kernel(
+        s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{{}}, grids, blocks, 0, kargs));
+}}
+
+"""
+
+    API_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <ck_tile/core.hpp>
+#include "layernorm2d_fwd.hpp"
+
+{F_traits_define}
+
+// Note: this internal API only declare, not define here, otherwise will block `make -j`
+template <typename Traits_>
+float layernorm2d_fwd_(const ck_tile::stream_config& s, layernorm2d_fwd_args a);
+
+float layernorm2d_fwd(layernorm2d_fwd_traits t,
+                      layernorm2d_fwd_args a,
+                      const ck_tile::stream_config& s)
+{{
+    float r = -1;
+{F_dispatch}
+    return r;
+}}
+
+"""
+
+    API_PER_DTYPE="""    {F_if}(t.prec_i == \"{F_i_type}\" && t.prec_o == \"{F_o_type}\"){{
+{F_per_n_case}
+    }}
+"""
+    API_PER_N_CASE="""        {F_if} {F_N_COND} {{
+{F_inner_dispatch}
+        }}
+"""
+    API_INNER_CASE="""            {F_if} {F_VEC_COND}
+                r={F_instance_func}(s, a);
+"""
+
+    INSTANCE_BASE = """
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd_api_common.hpp"
+
+// clang-format off
+//                                      prec_i           prec_o           prec_sy           rm  rn  tm    tn  vn  pd     mv    rpcf   welford   2p   xbias   add  sweep
+{F_instance_def}
+// clang-format on
+
+"""
+
+    def __init__(self, working_path, kernel_filter):
+        self.working_path = working_path
+        self.kernel_filter = kernel_filter
+
+    class k_xbias_enum(IntEnum):
+        F_NO_XBIAS = 0
+        F_ADD_XBIAS = 1
+
+    class k_fuesd_add_enum(IntEnum):
+        F_NO_ADD = 0
+        F_PRE_ADD = 1
+        F_PRE_ADD_STORE_RESIDUAL = 2
+
+    class k_fused_sweep_enum(IntEnum):
+        F_NO_SWEEP = 0
+        F_RENORM = 1
+        F_DYNAMIC_QUANT = 2
+
+    @dataclass
+    class k_traits:
+        F_kPadN : bool
+        F_kSaveMeanInvStd : bool
+        F_kTwoPass : bool
+        F_kXbias : Any #: layernorm_fwd_codegen.k_bias_enum
+        F_kFusedAdd : Any #: layernorm_fwd_codegen.k_fuesd_add_enum
+        F_kFusedQuant : Any  #: layernorm_fwd_codegen.k_fused_sweep_enum
+
+    @dataclass
+    class k_shape:
+        F_BlockTile    : List[int]
+        F_WarpPerBlock : List[int]
+        F_WarpTile     : List[int]
+        F_Vector_      : List[int]
+        @property
+        def F_BlockSize(self) -> int:
+            return functools.reduce(lambda a, b: a*b, self.F_WarpTile)
+
+    @dataclass
+    class k_problem:
+        F_XDataType       : str
+        F_XBiasDataType   : str
+        F_GammaDataType   : str
+        F_BetaDataType    : str
+        F_ComputeDataType : str
+        F_YDataType       : str
+        F_MeanDataType    : str
+        F_InvStdDataType  : str
+        F_BlockShape      : str
+        F_Traits          : Any #k_traits
+
+    @dataclass
+    class k_pipeline_one_pass:
+        F_Problem         : Any #k_problem
+    
+    @dataclass
+    class k_pipeline_two_pass:
+        F_Problem         : Any #k_problem
+
+    @dataclass
+    class default_2d_epilogue_problem:
+        F_AccDataType : str
+        F_ODataType : str
+        F_kPadM : bool
+        F_kPadN : bool
+
+    @dataclass
+    class default_2d_epilogue:
+        F_problem : Any
+
+    @dataclass
+    class k_kernel:
+        F_pipeline : Any
+        F_epilogue : Any
+
+    @dataclass
+    class h_traits:
+        F_XDataType : str
+        F_YDataType : str
+        F_SmoothScaleDataType : str
+        F_YScaleDataType : str
+        F_Repeat_M : int
+        F_Repeat_N : int
+        F_ThreadPerBlock_M : int
+        F_ThreadPerBlock_N : int
+        F_Vector_N : int
+        F_kPadN : bool
+        F_kSaveMeanInvStd_ : bool
+        F_kFastFDiv_ : bool
+        F_kWelford_ : bool
+        F_kTwoPass_ : bool
+        F_kXbias_ : int
+        F_kFusedAdd : int
+        F_kFusedQuant : int
+
+        @property
+        def trait_name(self) ->str:
+            t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_SmoothScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
+            t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}'
+            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kXbias:4}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
+            return t_
+
+        # string when calling this kernel
+        @property
+        def call_name(self) -> str:
+            return f'layernorm2d_fwd_<traits_<{self.trait_name}>>'
+
+        # string when define this kernel
+        @property
+        def def_name(self) -> str:
+            return f'template float layernorm2d_fwd_<traits_<{self.trait_name}>>(const S&, A);'
+
+    # this class hold kernel under same source file
+    @dataclass
+    class h_instance:
+        F_DataTypePair : str
+        F_N : str
+        F_xbias : int
+        F_add : int
+        F_sweep : int
+        instance_list : List[Any] # List[h_traits]
+
+        @property
+        def name(self) -> str:
+            prec_i, prec_o = self.F_DataTypePair.split(',')
+            dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}'
+            nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}'
+            if self.F_xbias != 0:
+                nnn = nnn + '_' + XBIAS_ENUM_STR_MAP[self.F_xbias] 
+            if self.F_add != 0:
+                nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
+            if self.F_sweep != 0:
+                nnn = nnn + '_' + FUSED_FUSED_SWEEP_STR_MAP[self.F_sweep]
+            return nnn
+
+        @property
+        def instance_name(self) ->str:
+            return self.name
+
+        @property
+        def content(self) ->str:
+            instance_defs = ''
+            for ins in self.instance_list:
+                instance_defs += ins.def_name + '\n'
+            return layernorm_fwd_codegen.INSTANCE_BASE.format(F_instance_def=instance_defs)
+
+    @property
+    def name_api(self) -> str:
+        return 'layernorm2d_fwd_api'
+
+    @property
+    def name_common_header(self) -> str:
+        return 'layernorm2d_fwd_api_common'
+
+    def content_api(self, args) -> str:
+        # 1 sort based on dtype
+        t_dtype_dict = dict()
+        blobs = self.get_blobs(args)
+        for blob in blobs:
+            if blob.F_DataTypePair not in t_dtype_dict:
+                t_dtype_dict[blob.F_DataTypePair] = {}
+            if blob.F_N not in t_dtype_dict[blob.F_DataTypePair]:
+                t_dtype_dict[blob.F_DataTypePair][blob.F_N] = []
+            t_dtype_dict[blob.F_DataTypePair][blob.F_N].append(blob)
+
+        d_str = ''
+        for i_d, dtype_ in enumerate(t_dtype_dict):
+            blob_per_t = t_dtype_dict[dtype_]
+            n_str = ''
+            for i_n, n_ in enumerate(blob_per_t):
+                blob_per_n = blob_per_t[n_]
+                inner_str = ""
+                for i_b, b_ in enumerate(blob_per_n):
+                    # generate single kernel instance file
+                    #vec_str = ""
+                    for i_ins, ins in enumerate(b_.instance_list):
+                        idx_in_n = i_b * len(b_.instance_list) + i_ins
+                        len_in_n = len(blob_per_n) * len(b_.instance_list)
+                        # _if = 'if' if i_ins == 0 else 'else if'
+                        if ins.F_kFusedQuant == 0:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep}'.format(f_fused_sweep = ins.F_kFusedQuant)
+                        elif ins.F_kFusedQuant == 1:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sm == \"{f_sx_type}\" && t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sx_type=ins.F_SmoothScaleDataType, f_sy_type=ins.F_YScaleDataType)
+                        elif ins.F_kFusedQuant == 2:
+                            _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
+                                f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
+                        _cond = '((a.n % {f_vec_n} == 0) && (t.xbias == {f_xbias}) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                                        f_vec_n = ins.F_Vector_N, f_xbias = ins.F_kXbias, f_fused_add = ins.F_kFusedAdd,
+                                        f_sweep_cond = _sweep_cond)
+                        inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False),
+                                            F_VEC_COND = _cond, F_instance_func=ins.call_name)
+                    #inner_str = inner_str + vec_str
+                n_cnd = f'(a.n <= {n_})' if isinstance(n_, int) else ''
+                n_str += self.API_PER_N_CASE.format(F_if = get_if_str(i_n, len(blob_per_t), not isinstance(n_, int)), F_N_COND=n_cnd, F_inner_dispatch=inner_str)
+            prec_i, prec_o = dtype_.split(',')
+            d_str += self.API_PER_DTYPE.format(F_if = get_if_str(i_d, len(t_dtype_dict), False), F_i_type=prec_i, F_o_type=prec_o, F_per_n_case=n_str)
+
+        api_base = self.API_BASE.format(F_traits_define=self.API_TRAITS_DEFINE, F_dispatch=d_str)
+        return api_base
+
+    @property
+    def content_common_header(self) -> str:
+        return self.API_COMMON_HEADER.format(F_traits_define=self.API_TRAITS_DEFINE)
+
+    def get_blobs(self, args):
+        h_traits = layernorm_fwd_codegen.h_traits
+        h_instance = layernorm_fwd_codegen.h_instance
+
+        dynamic_quant_out_dtype = ['int8', 'fp8']
+        # some predefined support range
+        # (prec_i,prec_o) for simplicity this string will be used as key for dict
+        scale_list = [('fp32,fp32')]
+        dtype_list = [('fp16,fp16'), ('bf16,bf16'),
+                        ('fp16,int8'), ('bf16,int8'),
+                        ('fp16,fp8'), ('bf16,fp8')] # NOTE: only fused-dynamic-quant use int8 or fp8 out
+        types_8bit = ('int8', 'fp8')
+        types_16bit = ('int16', 'fp16', 'bf16')
+        #fused_add_list = [0, 1, 2]
+        #fused_sweep_list = [0, 1, 2] # NOTE: only single pass can use fused dynamic quant
+        xbias_list = [0, 1]
+        fused_add_list = [0, 1]
+        fused_sweep_list = [0, 1] # NOTE: only single pass can use fused dynamic quant
+        #                                                       rm  rn  tm   tn  vn  pd     mv     fdiv  welford   2p     xbias    add   sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  8,  8,  8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  16, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1,  1,  4,  64, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  4,  64, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  4,  64, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  2, 128, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  2, 128, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  2, 128, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  4,  64, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  2, 128, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1, 256, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 128, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1, 256, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1,  2,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  2,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1, 512, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  3,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  6,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 8,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 512, 4,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 2,  True,  False, True, True,   False,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  8,  1,1024, 1,  True,  False, True, True,   False,   0,    0,    0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1,  1,  1,1024, 8,  True,  False, True, True,    True,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1, 256, 4,  True,  False, True, True,    True,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12,  1, 256, 2,  True,  False, True, True,    True,   0,    0,    0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1,  4,  1,1024, 1,  True,  False, True, True,    True,   0,    0,    0)]}
+        total_blob = list()
+        for hs_key in h_trait_dict:
+            hs = h_trait_dict[hs_key]
+            current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
+            for dtype, scale_type, xbias, fused_add, fused_quant in itertools.product(dtype_list, scale_list, xbias_list, fused_add_list, fused_sweep_list):
+                prec_i, prec_o = dtype.split(',')
+                scale_sm, scale_y = scale_type.split(',')
+                if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
+                    continue # skip non dynamic quant case
+                if fused_quant == 1 and hs_key == 'big':
+                    continue
+                current_hs = list()
+                for chs_ in hs:
+                    h_ = copy.copy(chs_) # copy the base instance out
+                    h_.F_XDataType = prec_i
+                    h_.F_YDataType = prec_o
+                    h_.F_SmoothScaleDataType = scale_sm
+                    h_.F_YScaleDataType = scale_y
+                    h_.F_kXbias = xbias
+                    h_.F_kFusedAdd = fused_add
+                    h_.F_kFusedQuant = fused_quant
+                    # disable welford update for 8bit and 16 bit smallN
+                    if not h_.F_kTwoPass_:
+                        #disable 16 bit when set args disable_16b_welford
+                        if args.disable_16b_welford and prec_i in types_16bit:
+                            h_.F_kWelford_ = False
+                        #disable 8bit by default
+                        elif prec_i in types_8bit or prec_o in types_8bit:
+                            h_.F_kWelford_ = False
+                        #disable 16bit small N
+                        elif prec_i in types_16bit and hs_key == '64':
+                            h_.F_kWelford_ = False
+                    current_hs.append(h_) # + "\n"
+                #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
+                current_n_str = 'big' if hs_key == 'big' else current_n
+                total_blob.append(h_instance(dtype, current_n_str, xbias, fused_add, fused_quant, current_hs))
+        return total_blob
+
+    def list_blobs(self, args) -> None:
+        w_p = Path(self.working_path)
+        list_p = w_p / 'layernorm2d_fwd_blobs.txt'
+        blobs = self.get_blobs(args)
+        with list_p.open('w') as list_f:
+            # api related file
+            list_f.write(str(w_p / (self.name_api + ".cpp"))  + "\n")
+            list_f.write(str(w_p / (self.name_common_header + ".hpp"))  + "\n")
+            # kernel instance file
+            for b in blobs:
+                list_f.write(str(w_p / (b.name + ".cpp")) + "\n")
+
+    def gen_blobs(self, args) -> None:
+        w_p = Path(self.working_path)
+        w_str = self.content_api(args)
+        (w_p / (self.name_api + ".cpp")).write_text(w_str)
+        (w_p / (self.name_common_header + ".hpp")).write_text(self.content_common_header)
+        blobs = self.get_blobs(args)
+        for b in blobs:
+            (w_p / (b.name + ".cpp")).write_text(b.content)
+
+def list_blobs(args):
+    api_list = args.api.split(',')
+    for api in api_list:
+        if api == 'fwd':
+            layernorm_fwd_codegen(args.working_path, args.filter).list_blobs(args)
+
+
+def gen_blobs(args):
+    api_list = args.api.split(',')
+    for api in api_list:
+        if api == 'fwd':
+            layernorm_fwd_codegen(args.working_path, args.filter).gen_blobs(args)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate",
+        description="gen API for CK layernorm kernel",
+    )
+    parser.add_argument(
+        "-a",
+        "--api",
+        default='fwd[all]',
+        required=False,
+        help="supply API(s) to generate (default: fwd). separated by comma."
+    )
+
+    # the directory for list_blobs/gen_blobs to write files into
+    parser.add_argument(
+        "-w",
+        "--working_path",
+        default="./",
+        required=False,
+        help="the path where all the blobs are going to be generated"
+    )
+
+    # this script have 2 modes
+    # 1) list_blobs mode, will generate a txt file with all the files going to be generated.
+    #    this is useful in build system like cmake to construct source code dependency, by
+    #    reading the content out of this file
+    # 2) gen_blobs mode, will generate the actuall kernel instance and api. If in framework
+    #    like FA, only need to use this mode
+    parser.add_argument(
+        "-l",
+        "--list_blobs",
+        action='store_true',
+        help="list all the kernels to a file, "
+    )
+
+    parser.add_argument(
+        "-g",
+        "--gen_blobs",
+        action='store_true',
+        help="generate all kernels into different tile"
+    )
+
+    # TODO: if using filter, must apply same value to output_dir and list_blobs
+    parser.add_argument(
+        "-f",
+        "--filter",
+        required=False,
+        help="filter out kernels that need to generate, using fnmatch module"
+    )
+
+    parser.add_argument(
+        "-t",
+        "--traits",
+        default="all",
+        required=False,
+        help="enable/disable some feature. default generate all"
+    )
+
+    parser.add_argument(
+        "-r",
+        "--receipt",
+        default=0,
+        required=False,
+        help="codegen receipt."
+    )
+
+    parser.add_argument(
+        "--disable_16b_welford",
+        default=False,
+        required=False,
+        help="enable/disable welford for 16bit datatype n > 64"
+    )
+
+    args = parser.parse_args()
+
+    # print(f'{args.list_blobs}-{args.gen_blobs}')
+    if (args.gen_blobs and args.list_blobs) or ((not args.gen_blobs) and (not args.list_blobs)):
+        print('gen_blobs/list_blobs must specify only one option')
+        sys.exit()
+
+    p = Path(args.working_path)
+    if not p.exists():
+        p.mkdir()
+
+    if args.list_blobs:
+        list_blobs(args)
+    else:
+        gen_blobs(args)
--- a/test/ck_tile/layernorm2d/layernorm2d_fwd.hpp
+++ b/test/ck_tile/layernorm2d/layernorm2d_fwd.hpp
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/layernorm2d.hpp"
+#include <string>
+
+template <typename InType,
+          typename OutType,
+          typename SmoothSScaleDataType_,
+          typename YScaleDataType_>
+struct LayerNormTypeConfig;
+
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::half_t, OutType, SmoothScaleDataType_, YScaleDataType_>
+{
+    using XDataType           = ck_tile::half_t;
+    using YDataType           = OutType;
+    using XBiasDataType       = ck_tile::half_t;
+    using GammaDataType       = ck_tile::half_t;
+    using BetaDataType        = ck_tile::half_t;
+    using MeanDataType        = ck_tile::half_t;
+    using InvStdDataType      = ck_tile::half_t;
+    using ComputeDataType     = float;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
+};
+
+template <typename OutType, typename SmoothScaleDataType_, typename YScaleDataType_>
+struct LayerNormTypeConfig<ck_tile::bf16_t, OutType, SmoothScaleDataType_, YScaleDataType_>
+{
+    using XDataType           = ck_tile::bf16_t;
+    using YDataType           = OutType;
+    using XBiasDataType       = ck_tile::bf16_t;
+    using GammaDataType       = ck_tile::bf16_t;
+    using BetaDataType        = ck_tile::bf16_t;
+    using MeanDataType        = ck_tile::bf16_t;
+    using InvStdDataType      = ck_tile::bf16_t;
+    using ComputeDataType     = float;
+    using SmoothScaleDataType = SmoothScaleDataType_;
+    using YScaleDataType      = YScaleDataType_;
+};
+
+// runtime args
+struct layernorm2d_fwd_args : public ck_tile::Layernorm2dFwdHostArgs
+{
+};
+
+// This is the public API, will be generated by script
+struct layernorm2d_fwd_traits
+{
+    std::string prec_i; // input precision
+    std::string prec_o; // output precision
+
+    // if fused_quant == 1, need set prec_sm/prec_sy to proper string, otherwise can set
+    // arbitrary(will skip check) if fused_quant == 2, need set prec_sy to proper string, otherwise
+    // can set arbitrary(will skip check)
+    std::string prec_sm; // x-scale, used for [1*N] input smooth quant
+    std::string prec_sy; // y-scale, used for [M*1] output for next layer
+
+    bool save_mean_var; //
+    int xbias;          // 0:no-bias, 1:add bias
+    int fused_add;      // 0:no-add, 1:pre-add-store, 2:pre-add
+    int fused_quant;    // 0:no-sweep, 1:smooth-dynamic-quant, 2:dynamic-quant
+};
+
+float layernorm2d_fwd(layernorm2d_fwd_traits, layernorm2d_fwd_args, const ck_tile::stream_config&);
--- a/test/ck_tile/layernorm2d/layernorm2d_fwd.inc
+++ b/test/ck_tile/layernorm2d/layernorm2d_fwd.inc
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck_tile/host.hpp"
+#include "layernorm2d_fwd.hpp"
+#include <algorithm>
+#include <cstring>
+
+// different threshold for different dtype
+template <typename DataType>
+auto get_elimit()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::bf16_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1e-2;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+template <>
+auto get_elimit<ck_tile::int8_t>()
+{
+    double rtol = 1e-2;
+    double atol = 1.0;
+    return ck_tile::make_tuple(rtol, atol);
+}
+
+auto create_args(int argc, char* argv[])
+{
+    ck_tile::ArgParser arg_parser;
+    arg_parser.insert("m", "3328", "m dimension")
+        .insert("n", "4096", "n dimension")
+        .insert("x_stride", "-1", "x row_stride, if -1 then equal to n")
+        .insert("xr_stride", "-1", "x residule row_stride, if -1 then equal to n")
+        .insert("y_stride", "-1", "y row_stride, if -1 then equal to n")
+        .insert("yr_stride", "-1", "y residule row_stride, if -1 then equal to n")
+        .insert("e", "1e-5", "epsilon")
+        .insert("save_mv", "0", "save mean/variance(invstd) or not. set to 1 in training case")
+        .insert("v", "1", "cpu validation or not")
+        .insert("kname", "1", "print kernel name or not")
+        .insert("prec_i", "fp16", "input precision")
+        .insert("prec_o", "auto", "output precision, set auto will be the same as input")
+        .insert("prec_sm",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1")
+        .insert("prec_sy",
+                "auto",
+                "output quant scale type, set auto will use fp32. used when fquant=1 or 2")
+        .insert("xbias", "0", "add bias, 0:no add, 1:add bias before fadd")
+        .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
+        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
+        .insert("warmup", "5", "cold iter")
+        .insert("repeat", "20", "hot iter");
+
+    bool result = arg_parser.parse(argc, argv);
+    return std::make_tuple(result, arg_parser);
+}
+
+template <typename InDataType,
+          typename OutDataType,
+          typename SmoothScaleDataType,
+          typename YScaleDataType,
+          bool SaveMeanVar>
+bool run(const ck_tile::ArgParser& arg_parser)
+{
+    ck_tile::index_t m        = arg_parser.get_int("m");
+    ck_tile::index_t n        = arg_parser.get_int("n");
+    ck_tile::index_t x_stride = arg_parser.get_int("x_stride");
+    if(x_stride < 0)
+        x_stride = n;
+    ck_tile::index_t xr_stride = arg_parser.get_int("xr_stride");
+    if(xr_stride < 0)
+        xr_stride = n;
+    ck_tile::index_t y_stride = arg_parser.get_int("y_stride");
+    if(y_stride < 0)
+        y_stride = n;
+    ck_tile::index_t yr_stride = arg_parser.get_int("yr_stride");
+    if(yr_stride < 0)
+        yr_stride = n;
+    float epsilon       = arg_parser.get_float("e");
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sm == "auto")
+    {
+        prec_sm = "fp32";
+    }
+    if(prec_sy == "auto")
+    {
+        prec_sy = "fp32";
+    }
+
+    int kname         = arg_parser.get_int("kname");
+    int do_validation = arg_parser.get_int("v");
+    int warmup        = arg_parser.get_int("warmup");
+    int repeat        = arg_parser.get_int("repeat");
+    int xbias         = arg_parser.get_int("xbias");
+    int fused_add     = arg_parser.get_int("fadd");
+    int fused_quant   = arg_parser.get_int("fquant");
+    if(fused_quant == 1 && prec_o != "int8" && prec_o != "fp8")
+    {
+        std::cout
+            << "if fused_quant is 1 or 2, only support \"-prec_o=int8\" or \"-prec_o=fp8\" cases."
+            << std::endl;
+        return false;
+    }
+
+    assert(x_stride >= n);
+
+    using TypeConfig =
+        LayerNormTypeConfig<InDataType, OutDataType, SmoothScaleDataType, YScaleDataType>;
+
+    using XDataType         = typename TypeConfig::XDataType;
+    using YDataType         = typename TypeConfig::YDataType;
+    using XBiasDataType     = typename TypeConfig::XBiasDataType;
+    using GammaDataType     = typename TypeConfig::GammaDataType;
+    using BetaDataType      = typename TypeConfig::BetaDataType;
+    using XResidualDataType = XDataType;
+    using YResidualDataType = XDataType;
+
+    using MeanDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::MeanDataType, ck_tile::null_type>;
+    using InvStdDataType =
+        std::conditional_t<SaveMeanVar, typename TypeConfig::InvStdDataType, ck_tile::null_type>;
+
+    using ComputeDataType = typename TypeConfig::ComputeDataType;
+
+    // host verify
+    ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
+    ck_tile::HostTensor<XBiasDataType> x_bias_host({n});
+    ck_tile::HostTensor<GammaDataType> gamma_host({n});
+    ck_tile::HostTensor<BetaDataType> beta_host({n});
+
+    ck_tile::HostTensor<XResidualDataType> x_residual_host({m, n}, {xr_stride, 1});
+    ck_tile::HostTensor<YResidualDataType> y_residual_host({m, n}, {yr_stride, 1});
+
+    ck_tile::HostTensor<YDataType> y_host_ref({m, n}, {y_stride, 1});
+    ck_tile::HostTensor<YDataType> y_host_dev({m, n}, {y_stride, 1});
+
+    ck_tile::HostTensor<MeanDataType> mean_host_ref({m});
+    ck_tile::HostTensor<InvStdDataType> invStd_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_ref({m});
+    ck_tile::HostTensor<YScaleDataType> y_scale_host_dev({m});
+
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host({n});
+    ck_tile::HostTensor<SmoothScaleDataType> sm_scale_host_dev({n});
+
+    ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
+    ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
+    ck_tile::FillUniformDistribution<SmoothScaleDataType>{-1.f, 1.f}(sm_scale_host);
+    ck_tile::FillUniformDistribution<XBiasDataType>{-.5f, .5f}(x_bias_host);
+    ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
+    ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
+
+    ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_bias_buf(x_bias_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_scale_buf(y_scale_host_dev.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem sm_scale_buf(sm_scale_host_dev.get_element_space_size_in_bytes());
+
+    ck_tile::DeviceMem x_residual_buf(x_residual_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
+
+    x_buf.ToDevice(x_host.data());
+    x_bias_buf.ToDevice(x_bias_host.data());
+    gamma_buf.ToDevice(gamma_host.data());
+    beta_buf.ToDevice(beta_host.data());
+    x_residual_buf.ToDevice(x_residual_host.data());
+    sm_scale_buf.ToDevice(sm_scale_host.data());
+
+    auto prec_str = [&]() {
+        auto base_str = prec_i;
+        if(prec_i != prec_o)
+        {
+            base_str += "|" + prec_o;
+        }
+        if(fused_quant == 1)
+        {
+            base_str += std::string("(") + prec_sy + ")";
+        }
+        return base_str;
+    }();
+
+    std::cout << "[" << prec_str << "]"
+              << " m:" << m << ", n:" << n << ", x_stride:" << x_stride
+              << ", xr_stride:" << xr_stride << ", y_stride:" << y_stride
+              << ", yr_stride:" << yr_stride << std::flush;
+
+    layernorm2d_fwd_traits traits{
+        prec_i, prec_o, prec_sm, prec_sy, SaveMeanVar, xbias, fused_add, fused_quant};
+
+    layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
+                              fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant == 1 ? sm_scale_buf.GetDeviceBuffer() : nullptr,
+                              x_bias_buf.GetDeviceBuffer(),
+                              gamma_buf.GetDeviceBuffer(),
+                              beta_buf.GetDeviceBuffer(),
+
+                              y_buf.GetDeviceBuffer(),
+                              fused_add == 1 ? y_residual_buf.GetDeviceBuffer() : nullptr,
+                              fused_quant != 0 ? y_scale_buf.GetDeviceBuffer() : nullptr,
+                              nullptr, // p_mean, unsupported yet
+                              nullptr, // p_invStd, unsupported yet
+
+                              epsilon,
+                              m,
+                              n,
+                              x_stride,   // x row_stride
+                              xr_stride,  // x residule row stride
+                              y_stride,   // y row stride
+                              yr_stride}; // y residule row stride
+
+    float ave_time = layernorm2d_fwd(
+        traits, args, ck_tile::stream_config{nullptr, true, kname ? 1 : 0, warmup, repeat});
+
+    if(ave_time < 0)
+    {
+        std::cout << " not supported!" << std::endl << std::flush;
+        return false;
+    }
+
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XBiasDataType) * n +
+                           sizeof(GammaDataType) * n + sizeof(BetaDataType) * n +
+                           sizeof(YDataType) * m * n;
+
+    float gb_per_sec = num_byte / 1.E6 / ave_time;
+    std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
+
+    bool pass = true;
+
+    if(do_validation)
+    {
+        // reference
+        if(xbias != 0)
+        {
+            // add bias before fadd
+            int M = x_host.mDesc.get_lengths()[0];
+            int N = x_host.mDesc.get_lengths()[1];
+            for(int idx_m = 0; idx_m < M; ++idx_m)
+            {
+                for(int idx_n = 0; idx_n < N; ++idx_n)
+                {
+                    x_host(idx_m, idx_n) = ck_tile::type_convert<XDataType>(
+                        ck_tile::type_convert<ComputeDataType>(x_host(idx_m, idx_n)) +
+                        ck_tile::type_convert<ComputeDataType>(x_bias_host(idx_n)));
+                }
+            }
+        }
+
+        if(fused_add != 0)
+        {
+            // fused pre_add/pre_add_store
+            // TODO we accumulate directly to x_host for simplcity here...
+
+            std::transform(x_host.mData.cbegin(),
+                           x_host.mData.cend(),
+                           x_residual_host.mData.cbegin(),
+                           x_host.mData.begin(),
+                           [](auto x_, auto r_) {
+                               auto o_ = ck_tile::type_convert<ComputeDataType>(x_) +
+                                         ck_tile::type_convert<ComputeDataType>(r_);
+                               return ck_tile::type_convert<XDataType>(o_);
+                           });
+        }
+        ck_tile::reference_layernorm2d_fwd<XDataType,
+                                           GammaDataType,
+                                           BetaDataType,
+                                           ComputeDataType,
+                                           YDataType,
+                                           MeanDataType,
+                                           InvStdDataType>(
+            x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
+
+        if(fused_quant != 0)
+        {
+            auto dquant_functor = [&](int m_, auto& o_, auto& acc_) {
+                int N_ = acc_.mDesc.get_lengths()[1];
+                if(fused_quant == 1)
+                {
+                    for(int n_ = 0; n_ < N_; n_++)
+                    {
+                        // input smooth outlier
+                        acc_(m_, n_) = acc_(m_, n_) *
+                                       ck_tile::type_convert<ComputeDataType>(sm_scale_host(n_));
+                    }
+                }
+                ComputeDataType absmax = static_cast<ComputeDataType>(0);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    const auto a = ck_tile::abs(acc_(m_, n_));
+                    absmax       = a > absmax ? a : absmax;
+                }
+                // printf("cpu:absmax:%f\n", absmax);
+                constexpr ComputeDataType kMaxY =
+                    std::is_same<YDataType, ck_tile::fp8_t>::value    ? 240.0
+                    : std::is_same<YDataType, ck_tile::int8_t>::value ? 127.0
+                                                                      : 0.0;
+                ComputeDataType y_scale = absmax / kMaxY;
+                y_scale_host_ref(m_)    = ck_tile::type_convert<YScaleDataType>(y_scale);
+                for(int n_ = 0; n_ < N_; n_++)
+                {
+                    o_(m_, n_) = ck_tile::type_convert<YDataType>(acc_(m_, n_) / y_scale);
+                }
+            };
+
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(x_host,
+                                                               gamma_host,
+                                                               beta_host,
+                                                               y_host_ref,
+                                                               mean_host_ref,
+                                                               invStd_host_ref,
+                                                               epsilon,
+                                                               dquant_functor);
+        }
+        else
+        {
+            ck_tile::reference_layernorm2d_fwd<XDataType,
+                                               GammaDataType,
+                                               BetaDataType,
+                                               ComputeDataType,
+                                               YDataType,
+                                               MeanDataType,
+                                               InvStdDataType>(
+                x_host, gamma_host, beta_host, y_host_ref, mean_host_ref, invStd_host_ref, epsilon);
+        }
+
+        y_buf.FromDevice(y_host_dev.data());
+
+        ck_tile::HostTensor<YResidualDataType> y_residual_host_dev({m, n}, {yr_stride, 1});
+        if(fused_add == 1)
+        {
+            y_residual_buf.FromDevice(y_residual_host_dev.data());
+        }
+
+        auto [rtol, atol] = get_elimit<OutDataType>();
+
+        if(x_stride == n)
+        {
+            pass = ck_tile::check_err(
+                y_host_dev, y_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol);
+            if(fused_add == 1)
+            {
+                pass &= ck_tile::check_err(y_residual_host_dev,
+                                           x_host,
+                                           std::string("ADD Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+            }
+        }
+        else
+        {
+            for(int i_r = 0; i_r < m; i_r++)
+            {
+                std::vector<YDataType> y_host_dev_row(y_host_dev.begin() + i_r * y_stride,
+                                                      y_host_dev.begin() + i_r * y_stride + n);
+                std::vector<YDataType> y_host_ref_row(y_host_ref.begin() + i_r * y_stride,
+                                                      y_host_ref.begin() + i_r * y_stride + n);
+                pass &= ck_tile::check_err(y_host_dev_row,
+                                           y_host_ref_row,
+                                           std::string("OUT[") + std::to_string(i_r) +
+                                               std::string("] Error: Incorrect results!"),
+                                           rtol,
+                                           atol);
+                if(fused_add == 1)
+                {
+                    std::vector<YResidualDataType> y_residual_host_dev_row(
+                        y_residual_host_dev.begin() + i_r * yr_stride,
+                        y_residual_host_dev.begin() + i_r * yr_stride + n);
+                    std::vector<YResidualDataType> y_residual_host_ref_row(
+                        x_host.begin() + i_r * yr_stride, x_host.begin() + i_r * yr_stride + n);
+                    pass &= ck_tile::check_err(y_residual_host_dev_row,
+                                               y_residual_host_ref_row,
+                                               std::string("ADD[") + std::to_string(i_r) +
+                                                   std::string("] Error: Incorrect results!"),
+                                               rtol,
+                                               atol);
+                }
+            }
+        }
+        if(fused_quant == 1)
+        {
+            y_scale_buf.FromDevice(y_scale_host_dev.data());
+            pass &= ck_tile::check_err(y_scale_host_dev,
+                                       y_scale_host_ref,
+                                       std::string("SCALE Error: Incorrect results!"),
+                                       rtol,
+                                       atol);
+        }
+
+        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
+    }
+
+    return pass;
+}
+
+bool dispatch_by_type(int argc, char* argv[])
+{
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return false;
+
+    std::string prec_i  = arg_parser.get_str("prec_i");
+    std::string prec_o  = arg_parser.get_str("prec_o");
+    std::string prec_sm = arg_parser.get_str("prec_sm");
+    std::string prec_sy = arg_parser.get_str("prec_sy");
+
+    if(prec_o == "auto")
+    {
+        prec_o = prec_i;
+    }
+    if(prec_sm == "auto")
+    {
+        prec_sm = "fp32";
+    }
+    if(prec_sy == "auto")
+    {
+        prec_sy = "fp32";
+    }
+    int save_mv = arg_parser.get_int("save_mv");
+
+    // no dynamic quant case
+    if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" && save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::half_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "bf16" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::bf16_t, float, float, true>(arg_parser) ? 0 : -2;
+    }
+
+    // dynamic quant case, only in inference
+    else if(prec_i == "fp16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "int8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::int8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "fp16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::half_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+    else if(prec_i == "bf16" && prec_o == "fp8" && prec_sm == "fp32" && prec_sy == "fp32" &&
+            !save_mv)
+    {
+        return run<ck_tile::bf16_t, ck_tile::fp8_t, float, float, false>(arg_parser) ? 0 : -2;
+    }
+
+    return false;
+}
+
+int run_layernorm2d_fwd_combinations(std::string const& data_type)
+{
+    constexpr size_t PARAM_COUNT = 20;
+    char bufs[PARAM_COUNT][64];
+    char* argv[PARAM_COUNT];
+
+    for(std::size_t i = 0; i < PARAM_COUNT; i++)
+    {
+        argv[i] = bufs[i];
+    }
+
+    std::vector<std::vector<std::string>> fquant = {
+        {}, {"-fquant=1", "-prec_o=int8"}, {"-fquant=1", "-prec_o=fp8"}};
+
+    std::vector<std::string> fadd = {"-fadd=0", "-fadd=1"};
+
+    std::vector<std::vector<std::string>> params = {
+        {"-m=99", "-n=13"},
+        {"-m=17", "-n=16"},
+        {"-m=1", "-n=100"},
+        {"-m=4", "-n=128"},
+        {"-m=80", "-n=127"},
+        {"-m=22", "-n=255 -stride=256"},
+        {"-m=7", "-n=599"},
+        {"-m=19", "-n=512"},
+        {"-m=33", "-n=313 -stride=1000"},
+        {"-m=11", "-n=510"},
+        {"-m=171", "-n=676 -stride=818"},
+        {"-m=91", "-n=636"},
+        {"-m=12", "-n=768 -stride=800"},
+        {"-m=100", "-n=766 -stride=812"},
+        {"-m=31", "-n=1024"},
+        {"-m=64", "-n=1000 -stride=1004"},
+        {"-m=8", "-n=1501"},
+        {"-m=3", "-n=1826"},
+        {"-m=5", "-n=2040"},
+        {"-m=7", "-n=2734"},
+        {"-m=1", "-n=3182"},
+        {"-m=9", "-n=4096"},
+        {"-m=3", "-n=8192"},
+        {"-m=3", "-n=9120"},
+        {"-m=1", "-n=10547"},
+    };
+
+    bool result = true;
+    int argc    = 0;
+    std::vector<int> argc_stack;
+    std::string pr_i = "-prec_i=" + data_type;
+    strncpy(bufs[argc++], "layernorm2d_fwd", 64);
+    strncpy(bufs[argc++], pr_i.c_str(), 64);
+    argc_stack.push_back(argc);
+    for(size_t fquant_idx = 0; fquant_idx < fquant.size(); fquant_idx++)
+    {
+        argc = argc_stack.back();
+        for(size_t j = 0; j < fquant[fquant_idx].size(); j++)
+        {
+            strncpy(bufs[argc++], fquant[fquant_idx][j].c_str(), 64);
+        }
+        argc_stack.push_back(argc);
+        for(size_t fadd_idx = 0; fadd_idx < fadd.size(); fadd_idx++)
+        {
+            argc = argc_stack.back();
+            strncpy(bufs[argc++], fadd[fadd_idx].c_str(), 64);
+            argc_stack.push_back(argc);
+            for(size_t param_idx = 0; param_idx < params.size(); param_idx++)
+            {
+                argc = argc_stack.back();
+                for(size_t j = 0; j < params[param_idx].size(); j++)
+                {
+                    strncpy(bufs[argc++], params[param_idx][j].c_str(), 64);
+                }
+
+                result = dispatch_by_type(argc, argv) && result;
+            }
+            argc_stack.pop_back();
+        }
+        argc_stack.pop_back();
+    }
+    argc_stack.pop_back();
+    return result ? 0 : -1;
+}
--- a/test/ck_tile/layernorm2d/layernorm2d_fwd_bf16.cpp
+++ b/test/ck_tile/layernorm2d/layernorm2d_fwd_bf16.cpp
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd.inc"
+
+int main() { return run_layernorm2d_fwd_combinations("bf16"); }
--- a/test/ck_tile/layernorm2d/layernorm2d_fwd_fp16.cpp
+++ b/test/ck_tile/layernorm2d/layernorm2d_fwd_fp16.cpp
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "layernorm2d_fwd.inc"
+
+int main() { return run_layernorm2d_fwd_combinations("fp16"); }