[CK Tile] enable building examples by default (#3259)

* remove EXCLUDE_FROM_ALL from ck-tile examples -> +15 min build time w/ 64 threads for a single arch * fix cpp17 compile error in the ck-tile examples --------- Co-authored-by: khuagarw <khuagarw@amd.com> Co-authored-by: Ding, Yi <yi.ding@amd.com>
2026-05-05 14:11:29 +00:00 · 2025-11-26 16:24:44 -08:00
parent 40d7217ac7
commit 79aae7c7f7
39 changed files with 175 additions and 174 deletions
--- a/example/ck_tile/01_fmha/CMakeLists.txt
+++ b/example/ck_tile/01_fmha/CMakeLists.txt
@@ -109,6 +109,7 @@ set(FMHA_FWD_INSTANCES "tile_fmha_fwd_instances")
 set(FMHA_BWD_INSTANCES "tile_fmha_bwd_instances")

 message(DEBUG "adding instances ${FMHA_FWD_INSTANCES}")
+# to save build time, exclude the target from "all" target of "01_fmha" directory and its ancestors
 add_library(${FMHA_FWD_INSTANCES} OBJECT EXCLUDE_FROM_ALL)
 target_include_directories(${FMHA_FWD_INSTANCES} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${FMHA_FWD_INSTANCES} PRIVATE ${FMHA_FWD_GEN_BLOBS})
--- a/example/ck_tile/02_layernorm2d/CMakeLists.txt
+++ b/example/ck_tile/02_layernorm2d/CMakeLists.txt
@@ -26,7 +26,7 @@ add_custom_command(
 set(EXAMPLE_LAYERNORM2D_FWD "tile_example_layernorm2d_fwd")

 message(DEBUG "adding example ${EXAMPLE_LAYERNORM2D_FWD}")
-add_executable(${EXAMPLE_LAYERNORM2D_FWD} EXCLUDE_FROM_ALL layernorm2d_fwd.cpp)
+add_executable(${EXAMPLE_LAYERNORM2D_FWD} layernorm2d_fwd.cpp)
 target_include_directories(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${EXAMPLE_LAYERNORM2D_FWD} PRIVATE ${LAYERNORM2D_FWD_GEN_BLOBS})

--- a/example/ck_tile/03_gemm/CMakeLists.txt
+++ b/example/ck_tile/03_gemm/CMakeLists.txt
@@ -1,20 +1,22 @@
-add_executable(tile_example_gemm_basic EXCLUDE_FROM_ALL gemm_basic.cpp)
-add_executable(tile_example_gemm_universal EXCLUDE_FROM_ALL universal_gemm.cpp)
-add_executable(tile_example_gemm_weight_preshuffle EXCLUDE_FROM_ALL gemm_weight_preshuffle.cpp)
-add_executable(tile_example_gemm_reduce EXCLUDE_FROM_ALL gemm_splitk_two_stage_reduce.cpp)
-add_executable(tile_example_gemm_splitk_two_stage EXCLUDE_FROM_ALL gemm_splitk_two_stage.cpp)
-set(EXAMPLE_GEMM_COMPILE_OPTIONS)
-set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
-if(CK_USE_OCP_FP8)
-  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+  add_executable(tile_example_gemm_basic gemm_basic.cpp)
+  add_executable(tile_example_gemm_universal universal_gemm.cpp)
+  add_executable(tile_example_gemm_weight_preshuffle gemm_weight_preshuffle.cpp)
+  add_executable(tile_example_gemm_reduce gemm_splitk_two_stage_reduce.cpp)
+  add_executable(tile_example_gemm_splitk_two_stage gemm_splitk_two_stage.cpp)
+  set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+  set(EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS)
+  if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  endif()
+  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+  list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
+  list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
+  list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
+  list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
+  target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
+  target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 endif()
-list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
-list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-unused-local-typedef)
-list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS -Wno-gnu-line-marker)
-list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS --save-temps)
-list(APPEND EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS "SHELL: -mllvm -greedy-reverse-local-assignment=1 -mllvm -enable-noalias-to-md-conversion=0")
-target_compile_options(tile_example_gemm_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_gemm_universal PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_gemm_weight_preshuffle PRIVATE ${EXAMPLE_WEIGHT_PRESHUFFLE_COMPILE_OPTIONS})
-target_compile_options(tile_example_gemm_reduce PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_gemm_splitk_two_stage PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
+++ b/example/ck_tile/03_gemm/gemm_splitk_two_stage_reduce.cpp
@@ -683,7 +683,7 @@ int run_gemm_example_with_layouts_two_stage(ck_tile::ArgParser& arg_parser,

    if constexpr(preshuffle)
    {
-        ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n);
+        ck_tile::HostTensor<BDataType> b_shuffle_host = ck_tile::shuffle_b<GemmConfig>(b_k_n);
        // shuffled buffer B for device implementation
        b_k_n_dev_buf.ToDevice(b_shuffle_host.data());
    }
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -284,12 +284,12 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
            if constexpr(GemmConfig::TiledMMAPermuteN)
            {
                std::cout << "Run with PermuteN" << std::endl;
-                return shuffle_b_permuteN<GemmConfig>(b_k_n);
+                return ck_tile::shuffle_b_permuteN<GemmConfig>(b_k_n);
            }
            else
            {
                std::cout << "Run without PermuteN" << std::endl;
-                return shuffle_b<GemmConfig>(b_k_n);
+                return ck_tile::shuffle_b<GemmConfig>(b_k_n);
            }
        }();
        // shuffled buffer B for device implementation
--- a/example/ck_tile/04_img2col/CMakeLists.txt
+++ b/example/ck_tile/04_img2col/CMakeLists.txt
@@ -1,3 +1,3 @@
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp)
+add_executable(tile_example_img2col image_to_column.cpp)
--- a/example/ck_tile/05_reduce/CMakeLists.txt
+++ b/example/ck_tile/05_reduce/CMakeLists.txt
@@ -3,7 +3,7 @@ set(EXAMPLE_REDUCE "tile_example_reduce")
 # to be included in "make all/install/check"
 message(DEBUG "adding example ${EXAMPLE_REDUCE}")

-add_executable(${EXAMPLE_REDUCE} EXCLUDE_FROM_ALL reduce.cpp)
+add_executable(${EXAMPLE_REDUCE} reduce.cpp)
 target_include_directories(${EXAMPLE_REDUCE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 set(EXAMPLE_REDUCE_COMPILE_OPTIONS)

@@ -16,4 +16,4 @@ target_compile_options(${EXAMPLE_REDUCE} PRIVATE ${EXAMPLE_REDUCE_COMPILE_OPTION
 # by cmake will print too many files, execvp: /bin/sh: Argument list too long
 # however, this property may affect global
 # TODO: consider codegen a makefile by us
-set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
+set_property(GLOBAL PROPERTY RULE_MESSAGES OFF)
--- a/example/ck_tile/06_permute/CMakeLists.txt
+++ b/example/ck_tile/06_permute/CMakeLists.txt
@@ -1,6 +1,6 @@
 # not using add_example_executable() to add this target, since we don't want this to have
 # to be included in "make all/install/check"
-add_executable(tile_example_permute EXCLUDE_FROM_ALL permute.cpp)
+add_executable(tile_example_permute permute.cpp)

 if(NOT DEFINED PERMUTE_USE_ALTERNATIVE_IMPL)
 # set(PERMUTE_USE_ALTERNATIVE_IMPL false)
--- a/example/ck_tile/09_topk_softmax/CMakeLists.txt
+++ b/example/ck_tile/09_topk_softmax/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_executable(tile_example_topk_softmax EXCLUDE_FROM_ALL topk_softmax.cpp topk_softmax_api.cpp)
+add_executable(tile_example_topk_softmax topk_softmax.cpp topk_softmax_api.cpp)
 target_include_directories(tile_example_topk_softmax PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)

 set(EXAMPLE_TOPK_SOFTMAX_COMPILE_OPTIONS)
--- a/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
+++ b/example/ck_tile/10_rmsnorm2d/CMakeLists.txt
@@ -26,7 +26,7 @@ add_custom_command(
 set(TILE_RMSNORM2D_FWD "tile_rmsnorm2d_fwd")

 message(DEBUG "adding ${TILE_RMSNORM2D_FWD}")
-add_executable(${TILE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL rmsnorm2d_fwd.cpp)
+add_executable(${TILE_RMSNORM2D_FWD} rmsnorm2d_fwd.cpp)
 target_include_directories(${TILE_RMSNORM2D_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${TILE_RMSNORM2D_FWD} PRIVATE ${RMSNORM2D_FWD_GEN_BLOBS})

@@ -38,7 +38,7 @@ list(APPEND TILE_RMSNORM2D_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -Wno
 target_compile_options(${TILE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})

 set(EXAMPLE_RMSNORM2D_FWD "tile_example_rmsnorm2d_fwd")
-add_executable(${EXAMPLE_RMSNORM2D_FWD} EXCLUDE_FROM_ALL example_rmsnorm2d_fwd.cpp)
+add_executable(${EXAMPLE_RMSNORM2D_FWD} example_rmsnorm2d_fwd.cpp)
 target_compile_options(${EXAMPLE_RMSNORM2D_FWD} PRIVATE ${TILE_RMSNORM2D_FWD_COMPILE_OPTIONS})

 # TODO: we have to turn off this global prop, otherwise the progress bar generated
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/CMakeLists.txt
@@ -3,7 +3,7 @@ set(TILE_ADD_RMSNORM2D_RDQUANT_FWD "tile_add_rmsnorm2d_rdquant_fwd")
 # to be included in "make all/install/check"
 message(DEBUG "adding ${TILE_ADD_RMSNORM2D_RDQUANT_FWD}")
 file(GLOB INSTANCE_SRCS instances/*.cpp)
-add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL add_rmsnorm2d_rdquant_fwd.cpp)
+add_executable(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} add_rmsnorm2d_rdquant_fwd.cpp)
 target_include_directories(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
 target_sources(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${INSTANCE_SRCS})

@@ -15,7 +15,7 @@ list(APPEND TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS -Wno-undefined-func-t
 target_compile_options(${TILE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})

 set(EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD "tile_example_add_rmsnorm2d_rdquant_fwd")
-add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} EXCLUDE_FROM_ALL example_add_rmsnorm2d_rdquant_fwd.cpp)
+add_executable(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} example_add_rmsnorm2d_rdquant_fwd.cpp)
 target_compile_options(${EXAMPLE_ADD_RMSNORM2D_RDQUANT_FWD} PRIVATE ${TILE_ADD_RMSNORM2D_RDQUANT_FWD_COMPILE_OPTIONS})

 # TODO: we have to turn off this global prop, otherwise the progress bar generated
--- a/example/ck_tile/12_smoothquant/CMakeLists.txt
+++ b/example/ck_tile/12_smoothquant/CMakeLists.txt
@@ -2,7 +2,7 @@ function (add_smoothquant_example TARGET_NAME MAIN_SRC)
    message(DEBUG "adding ${TARGET_NAME}")
    # not using add_example_executable() to add target, since we don't want this to have
    # to be included in "make all/install/check"
-    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
+    add_executable(${TARGET_NAME} ${MAIN_SRC})
    target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})

    foreach(source IN LISTS ARGN)
--- a/example/ck_tile/13_moe_sorting/CMakeLists.txt
+++ b/example/ck_tile/13_moe_sorting/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_executable(tile_example_moe_sorting EXCLUDE_FROM_ALL moe_sorting.cpp moe_sorting_api.cpp)
+add_executable(tile_example_moe_sorting moe_sorting.cpp moe_sorting_api.cpp)
 target_include_directories(tile_example_moe_sorting PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)

 set(EXAMPLE_MOE_SORTING_COMPILE_OPTIONS)
--- a/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
+++ b/example/ck_tile/14_moe_smoothquant/CMakeLists.txt
@@ -2,7 +2,7 @@ function (add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
    message(DEBUG "adding ${TARGET_NAME}")
    # not using add_example_executable() to add target, since we don't want this to have
    # to be included in "make all/install/check"
-    add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${MAIN_SRC})
+    add_executable(${TARGET_NAME} ${MAIN_SRC})
    target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_LIST_DIR})

    foreach(source IN LISTS ARGN)
@@ -22,4 +22,3 @@ endfunction(add_moe_smoothquant_example TARGET_NAME MAIN_SRC)
 file(GLOB INSTANCE_SRCS instances/*.cpp)

 add_moe_smoothquant_example(tile_example_moe_smoothquant moe_smoothquant.cpp ${INSTANCE_SRCS})
-
--- a/example/ck_tile/15_fused_moe/CMakeLists.txt
+++ b/example/ck_tile/15_fused_moe/CMakeLists.txt
@@ -1,19 +1,19 @@
-set(TILE_EXAPMLE_FUSED_MOE "tile_example_fused_moe")
-# not using add_example_executable() to add this target, since we don't want this to have
-# to be included in "make all/install/check"
-message(DEBUG "adding ${TILE_EXAPMLE_FUSED_MOE}")
-file(GLOB INSTANCE_SRCS instances/*.cpp)
-add_executable(${TILE_EXAPMLE_FUSED_MOE} EXCLUDE_FROM_ALL main.cpp)
-target_include_directories(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
-target_sources(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})
+if(GPU_TARGETS MATCHES "gfx94|gfx95")
+  set(TILE_EXAMPLE_FUSED_MOE "tile_example_fused_moe")
+  message(DEBUG "adding ${TILE_EXAMPLE_FUSED_MOE}")
+  file(GLOB INSTANCE_SRCS instances/*.cpp)
+  add_executable(${TILE_EXAMPLE_FUSED_MOE} main.cpp)
+  target_include_directories(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${CMAKE_CURRENT_LIST_DIR})
+  target_sources(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${INSTANCE_SRCS})

-set(TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS)
+  set(TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS)

-# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
-list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
-list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a
-list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta
-# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS  -mllvm -greedy-reverse-local-assignment=1)
-# list(APPEND TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+  # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+  list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+  list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_BUFFER_LOAD_AGPR=1) # TODO: enable load to a
+  list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=4) # rta
+  # list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS  -mllvm -greedy-reverse-local-assignment=1)
+  # list(APPEND TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)

-target_compile_options(${TILE_EXAPMLE_FUSED_MOE} PRIVATE ${TILE_EXAPMLE_FUSED_MOE_COMPILE_OPTIONS})
+  target_compile_options(${TILE_EXAMPLE_FUSED_MOE} PRIVATE ${TILE_EXAMPLE_FUSED_MOE_COMPILE_OPTIONS})
+endif()
--- a/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
+++ b/example/ck_tile/15_fused_moe/instances/fused_moesorting_api.cpp
@@ -402,22 +402,6 @@ float fused_moesorting_mp(fused_moesorting_trait t,
        using ms_index_t     = ck_tile::index_t;
        using ms_weight_type = float;

-        auto maybe_clear_workspace = [=](const ck_tile::stream_config& s_) {
-            if(t.clear_workspace_inside_api)
-            {
-                if(is_local_token)
-                {
-                    auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(true, 1024, 1);
-                    k(s_);
-                }
-                else
-                {
-                    auto k = MOR_SORTING_CLEAR_WS_DISPATCH_(false, 1024, 1);
-                    k(s_);
-                }
-            }
-        };
-
        if(a.tokens < 2048)
        {
            if(ck_tile::impl::moe_sorting_get_smem_size_p23(a.num_experts) >
--- a/example/ck_tile/16_batched_gemm/CMakeLists.txt
+++ b/example/ck_tile/16_batched_gemm/CMakeLists.txt
@@ -1 +1 @@
-add_executable(tile_example_batched_gemm EXCLUDE_FROM_ALL batched_gemm.cpp)
+add_executable(tile_example_batched_gemm batched_gemm.cpp)
--- a/example/ck_tile/17_grouped_gemm/CMakeLists.txt
+++ b/example/ck_tile/17_grouped_gemm/CMakeLists.txt
@@ -1,12 +1,14 @@
-add_executable(tile_example_grouped_gemm EXCLUDE_FROM_ALL grouped_gemm.cpp)
-add_executable(tile_example_quant_grouped_gemm EXCLUDE_FROM_ALL quant_grouped_gemm.cpp)
-add_executable(tile_example_grouped_gemm_preshuffle EXCLUDE_FROM_ALL grouped_gemm_preshuffle.cpp)
-add_executable(tile_example_grouped_gemm_multi_d EXCLUDE_FROM_ALL grouped_gemm_multi_d.cpp)
-set(EXAMPLE_GEMM_COMPILE_OPTIONS)
-if(CK_USE_OCP_FP8)
-  list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+  add_executable(tile_example_grouped_gemm grouped_gemm.cpp)
+  add_executable(tile_example_quant_grouped_gemm quant_grouped_gemm.cpp)
+  add_executable(tile_example_grouped_gemm_preshuffle grouped_gemm_preshuffle.cpp)
+  add_executable(tile_example_grouped_gemm_multi_d grouped_gemm_multi_d.cpp)
+  set(EXAMPLE_GEMM_COMPILE_OPTIONS)
+  if(CK_USE_OCP_FP8)
+    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
+  endif()
+  target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 endif()
-target_compile_options(tile_example_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_grouped_gemm_preshuffle PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_grouped_gemm_multi_d PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
-target_compile_options(tile_example_quant_grouped_gemm PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
--- a/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
+++ b/example/ck_tile/17_grouped_gemm/run_grouped_gemm_example.inc
@@ -233,7 +233,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
        // Perform preshuffle for B tensor
        if constexpr(GemmConfig::Preshuffle)
        {
-            ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<GemmConfig>(b_k_n_tensors[i]);
+            ck_tile::HostTensor<BDataType> b_shuffle_host =
+                ck_tile::shuffle_b<GemmConfig>(b_k_n_tensors[i]);
            b_k_n_dev_buf.push_back(std::make_unique<ck_tile::DeviceMem>(b_shuffle_host));
        }
        else
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -9,18 +9,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(has_supported_gpu)
-    add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
-    add_executable(tile_example_mixed_prec_flatmm EXCLUDE_FROM_ALL mixed_prec/mixed_prec_flatmm.cpp)
-    add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp)
-    add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp)
-    add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp)
-
-    include(mxgemm/mx_flatmm_instance.cmake)
-    mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
-    message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
-    add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
-    target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
-
    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
    #       ... because they are auto-generated
    set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template)
@@ -30,11 +18,28 @@ if(has_supported_gpu)
        list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
    endif()

+    add_executable(tile_example_flatmm_basic flatmm_basic.cpp)
    target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only
-endif()

+    add_executable(tile_example_moe_flatmm moe_flatmm.cpp)
+    target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+    add_executable(tile_example_grouped_flatmm grouped_flatmm.cpp)
+    target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+    if (GPU_TARGETS MATCHES "gfx95")
+        add_executable(tile_example_mixed_prec_flatmm mixed_prec/mixed_prec_flatmm.cpp)
+        target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+        add_executable(tile_example_a16w4_moe_flatmm mixed_prec/a16w4_moe_flatmm.cpp)
+        target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+        include(mxgemm/mx_flatmm_instance.cmake)
+        mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
+        message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
+
+        add_executable(tile_example_mx_flatmm mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
+        target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
+        target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+    endif()
+endif()
--- a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc
@@ -215,7 +215,7 @@ int run_contiguous_grouped_flatmm_example_with_layouts(
    assert(N % N_Warp_Tile == 0 &&
           "N must be divisible by N_Warp_Tile for contiguous grouped gemm");
    ck_tile::HostTensor<BDataType> b_shuffle_host =
-        shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
+        ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);

    std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
        std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
@@ -431,7 +431,7 @@ int run_masked_grouped_flatmm_example_with_layouts(
    assert(N % N_Warp_Tile == 0 &&
           "N must be divisible by N_Warp_Tile for contiguous grouped gemm");
    ck_tile::HostTensor<BDataType> b_shuffle_host =
-        shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
+        ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);

    std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
        std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
--- a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
@@ -302,10 +302,6 @@ int run_moe_gemm_example_with_layouts(int argc,
            static_cast<float*>(per_token_scale_dev_buf.GetDeviceBuffer()),
            static_cast<float*>(per_channel_scale_dev_buf.GetDeviceBuffer()));

-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, 1 /*kbatch*/, max_accumulated_value);
        c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data());

        const float rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;
--- a/example/ck_tile/19_gemm_multi_d/CMakeLists.txt
+++ b/example/ck_tile/19_gemm_multi_d/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_executable(tile_example_gemm_multi_d_fp16 EXCLUDE_FROM_ALL gemm_multi_d_fp16.cpp)
+add_executable(tile_example_gemm_multi_d_fp16 gemm_multi_d_fp16.cpp)
 set(EXAMPLE_GEMM_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
--- a/example/ck_tile/20_grouped_convolution/CMakeLists.txt
+++ b/example/ck_tile/20_grouped_convolution/CMakeLists.txt
@@ -1,20 +1,22 @@
-set(EXAMPLE_CONV_COMPILE_OPTIONS)
-list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+  set(EXAMPLE_CONV_COMPILE_OPTIONS)
+  list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)

-add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
-target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_fwd grouped_convolution_forward.cpp)
+  target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

-add_executable(tile_example_grouped_conv_fwd_large_tensor EXCLUDE_FROM_ALL grouped_convolution_forward_large_tensor.cpp)
-target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_fwd_large_tensor grouped_convolution_forward_large_tensor.cpp)
+  target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

-add_executable(tile_example_grouped_conv_fwd_bias_clamp EXCLUDE_FROM_ALL grouped_convolution_forward_bias_clamp.cpp)
-target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_fwd_bias_clamp grouped_convolution_forward_bias_clamp.cpp)
+  target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})

-add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_bwd_weight grouped_convolution_backward_weight.cpp)
+  target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

-add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_bwd_weight_two_stage grouped_convolution_backward_weight_two_stage.cpp)
+  target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})

-add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+  add_executable(tile_example_grouped_conv_bwd_data grouped_convolution_backward_data.cpp)
+  target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+endif()
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
@@ -101,7 +101,6 @@ struct GroupedConvolutionForwardInvoker
        const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
        const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
        const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-        float ave_time{0};

        using TransformType =
            ck_tile::TransformConvFwdToGemm<NDimSpatial,
@@ -242,13 +241,15 @@ struct GroupedConvolutionForwardInvoker
        // =====================================================================
        // Kernel launch lambda: Uses EnableSplitImage based on layout support
        // =====================================================================
-        const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
-                                                    const auto tail_number_,
-                                                    const auto memory_operation_) {
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_,
+                             const auto enable_split_image_) {
            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
            constexpr auto tail_number_v    = tail_number_.value;
            constexpr auto scheduler        = ConvConfig::Scheduler;
            constexpr auto memory_operation = memory_operation_.value;
+            constexpr bool EnableSplitImage = enable_split_image_.value;

            using GroupedConvTraitsType = std::conditional_t<EnableSplitImage,
                                                             GroupedConvTraitsTypeLargeTensor,
@@ -357,11 +358,9 @@ struct GroupedConvolutionForwardInvoker
                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
            }

-            ave_time = ck_tile::launch_kernel(
+            return ck_tile::launch_kernel(
                s,
                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-            return ave_time;
        };

        // =====================================================================
@@ -369,28 +368,33 @@ struct GroupedConvolutionForwardInvoker
        // =====================================================================
        if(use_split_image)
        {
-            // Use split-image kernel (Kernel<true>)
            const auto RunSplitImage = [&](const auto has_hot_loop_, const auto tail_number_) {
                if(args.k_batch == 1)
-                    Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                    return Run(
+                        has_hot_loop_, tail_number_, MemoryOpSet{}, ck_tile::bool_constant<true>{});
                else
-                    Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpAtomicAdd{},
+                               ck_tile::bool_constant<true>{});
            };
-            BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
+            return BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
        }
        else
        {
-            // Use regular kernel (Kernel<false>)
            const auto RunRegular = [&](const auto has_hot_loop_, const auto tail_number_) {
                if(args.k_batch == 1)
-                    Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpSet{},
+                               ck_tile::bool_constant<false>{});
                else
-                    Run.template operator()<false>(
-                        has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpAtomicAdd{},
+                               ck_tile::bool_constant<false>{});
            };
-            BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
+            return BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
        }
-
-        return ave_time;
    }
 };
--- a/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt
+++ b/example/ck_tile/22_gemm_multi_abd/CMakeLists.txt
@@ -1 +1 @@
-add_executable(tile_example_gemm_multi_abd_fp16 EXCLUDE_FROM_ALL gemm_multi_abd_fp16.cpp)
+add_executable(tile_example_gemm_multi_abd_fp16 gemm_multi_abd_fp16.cpp)
--- a/example/ck_tile/35_batched_transpose/CMakeLists.txt
+++ b/example/ck_tile/35_batched_transpose/CMakeLists.txt
@@ -1,9 +1,10 @@
-set(TARGET_NAME tile_example_batched_transpose)
-add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL batched_transpose_example.cpp batched_transpose_api.cpp)
-target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)
-
-# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
-list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
-# list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
-target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS})
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+  set(TARGET_NAME tile_example_batched_transpose)
+  add_executable(${TARGET_NAME} batched_transpose_example.cpp batched_transpose_api.cpp)
+  target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/)

+  # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
+  list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
+  # list(APPEND EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS -v --save-temps -Wno-gnu-line-marker)
+  target_compile_options(tile_example_batched_transpose PRIVATE ${EXAMPLE_BATCHED_TRANSPOSE_COMPILE_OPTIONS})
+endif()
--- a/example/ck_tile/36_pooling/CMakeLists.txt
+++ b/example/ck_tile/36_pooling/CMakeLists.txt
@@ -1,8 +1,7 @@
 set(EXAMPLE_POOL_3D "tile_example_pool3d")
 message(DEBUG "adding example ${EXAMPLE_POOL_3D}")

-add_executable(${EXAMPLE_POOL_3D} EXCLUDE_FROM_ALL pool3d.cpp)
+add_executable(${EXAMPLE_POOL_3D} pool3d.cpp)
 target_include_directories(${EXAMPLE_POOL_3D} PRIVATE ${CMAKE_CURRENT_LIST_DIR})

 target_compile_options(${EXAMPLE_POOL_3D} PRIVATE ${EXAMPLE_POOL_COMPILE_OPTIONS})
-
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -7,7 +7,7 @@ list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion

 if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
    set(EXE_NAME tile_example_gemm_quant)
-    add_executable(${EXE_NAME} EXCLUDE_FROM_ALL 
+    add_executable(${EXE_NAME}
        gemm_quant.cpp
        gemm_aquant_quantgrouped.cpp
        gemm_aquant_quantgrouped_preshufflequant.cpp
--- a/example/ck_tile/40_streamk_gemm/CMakeLists.txt
+++ b/example/ck_tile/40_streamk_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(GPU_TARGETS MATCHES "gfx9")
-    add_executable(tile_example_streamk_gemm_basic EXCLUDE_FROM_ALL streamk_gemm_basic.cpp)
+    add_executable(tile_example_streamk_gemm_basic streamk_gemm_basic.cpp)
    set(EXAMPLE_GEMM_COMPILE_OPTIONS)
    if(CK_USE_OCP_FP8)
 	    list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
--- a/example/ck_tile/41_batched_contraction/CMakeLists.txt
+++ b/example/ck_tile/41_batched_contraction/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_executable(tile_example_batched_contraction EXCLUDE_FROM_ALL batched_contraction.cpp)
+add_executable(tile_example_batched_contraction batched_contraction.cpp)
 set(EXAMPLE_CONTRACTION_COMPILE_OPTIONS)
 if(CK_USE_OCP_FP8)
    list(APPEND EXAMPLE_CONTRACTION_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
--- a/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc
+++ b/example/ck_tile/41_batched_contraction/run_batched_contraction_example.inc
@@ -316,20 +316,20 @@ int run_batched_contraction_example_with_layouts(

        auto start_time = std::chrono::high_resolution_clock::now();

-        calculate_reference_flat_indexing<ADataType,
-                                          BDataType,
-                                          DDataType,
-                                          EDataType,
-                                          AccDataType,
-                                          CDEElementWise>(a_full_dims_host,
-                                                          b_full_dims_host,
-                                                          ds_full_dims_host,
-                                                          e_full_dims_host_ref,
-                                                          G_total,
-                                                          M_total,
-                                                          N_total,
-                                                          K_total,
-                                                          CDEElementWise{});
+        ck_tile::calculate_reference_flat_indexing<ADataType,
+                                                   BDataType,
+                                                   DDataType,
+                                                   EDataType,
+                                                   AccDataType,
+                                                   CDEElementWise>(a_full_dims_host,
+                                                                   b_full_dims_host,
+                                                                   ds_full_dims_host,
+                                                                   e_full_dims_host_ref,
+                                                                   G_total,
+                                                                   M_total,
+                                                                   N_total,
+                                                                   K_total,
+                                                                   CDEElementWise{});

        auto end_time = std::chrono::high_resolution_clock::now();
        auto duration =