[CK Tile] enable building examples by default (#3259)

* Remove EXCLUDE_FROM_ALL from the ck-tile examples (adds about 15 min of build time with 64 threads for a single arch)
* Fix the C++17 compile error in the ck-tile examples

---------

Co-authored-by: khuagarw <khuagarw@amd.com>
Co-authored-by: Ding, Yi <yi.ding@amd.com>
2026-05-05 14:11:29 +00:00 · 2025-11-26 16:24:44 -08:00
parent 40d7217ac7
commit 79aae7c7f7
39 changed files with 175 additions and 174 deletions
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -9,18 +9,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
 endforeach()

 if(has_supported_gpu)
-    add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
-    add_executable(tile_example_mixed_prec_flatmm EXCLUDE_FROM_ALL mixed_prec/mixed_prec_flatmm.cpp)
-    add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp)
-    add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp)
-    add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp)
-
-    include(mxgemm/mx_flatmm_instance.cmake)
-    mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
-    message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
-    add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
-    target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
-
    # NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
    #       ... because they are auto-generated
    set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template)
@@ -30,11 +18,28 @@ if(has_supported_gpu)
        list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
    endif()

+    add_executable(tile_example_flatmm_basic flatmm_basic.cpp)
    target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
-    target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only
-endif()

+    add_executable(tile_example_moe_flatmm moe_flatmm.cpp)
+    target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+    add_executable(tile_example_grouped_flatmm grouped_flatmm.cpp)
+    target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+    if (GPU_TARGETS MATCHES "gfx95")
+        add_executable(tile_example_mixed_prec_flatmm mixed_prec/mixed_prec_flatmm.cpp)
+        target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+        add_executable(tile_example_a16w4_moe_flatmm mixed_prec/a16w4_moe_flatmm.cpp)
+        target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+
+        include(mxgemm/mx_flatmm_instance.cmake)
+        mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
+        message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
+
+        add_executable(tile_example_mx_flatmm mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
+        target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
+        target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
+    endif()
+endif()
--- a/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_grouped_flatmm_example.inc
@@ -215,7 +215,7 @@ int run_contiguous_grouped_flatmm_example_with_layouts(
    assert(N % N_Warp_Tile == 0 &&
           "N must be divisible by N_Warp_Tile for contiguous grouped gemm");
    ck_tile::HostTensor<BDataType> b_shuffle_host =
-        shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
+        ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);

    std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
        std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
@@ -431,7 +431,7 @@ int run_masked_grouped_flatmm_example_with_layouts(
    assert(N % N_Warp_Tile == 0 &&
           "N must be divisible by N_Warp_Tile for contiguous grouped gemm");
    ck_tile::HostTensor<BDataType> b_shuffle_host =
-        shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
+        ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);

    std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
        std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
--- a/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_moe_flatmm_example.inc
@@ -302,10 +302,6 @@ int run_moe_gemm_example_with_layouts(int argc,
            static_cast<float*>(per_token_scale_dev_buf.GetDeviceBuffer()),
            static_cast<float*>(per_channel_scale_dev_buf.GetDeviceBuffer()));

-        const float max_accumulated_value =
-            *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
-        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
-            K, 1 /*kbatch*/, max_accumulated_value);
        c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data());

        const float rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;