[CK Tile] enable building examples by default (#3259)

* remove EXCLUDE_FROM_ALL from the CK Tile examples so they are built by default
  (cost: +15 min of build time with 64 threads for a single GPU arch)

* fix C++17 compile error in the CK Tile examples

---------

Co-authored-by: khuagarw <khuagarw@amd.com>
Co-authored-by: Ding, Yi <yi.ding@amd.com>
This commit is contained in:
Max Podkorytov
2025-11-26 16:24:44 -08:00
committed by GitHub
parent 40d7217ac7
commit 79aae7c7f7
39 changed files with 175 additions and 174 deletions

View File

@@ -9,18 +9,6 @@ foreach(gpu IN LISTS GPU_TARGETS)
endforeach()
if(has_supported_gpu)
add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
add_executable(tile_example_mixed_prec_flatmm EXCLUDE_FROM_ALL mixed_prec/mixed_prec_flatmm.cpp)
add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp)
add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp)
add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp)
include(mxgemm/mx_flatmm_instance.cmake)
mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
# NOTE: we turn off undefined-func-template to let source compile without explicit declare function specializations
# ... because they are auto-generated
set(EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template)
@@ -30,11 +18,28 @@ if(has_supported_gpu)
list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
endif()
add_executable(tile_example_flatmm_basic flatmm_basic.cpp)
target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only
endif()
add_executable(tile_example_moe_flatmm moe_flatmm.cpp)
target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
add_executable(tile_example_grouped_flatmm grouped_flatmm.cpp)
target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
if (GPU_TARGETS MATCHES "gfx95")
add_executable(tile_example_mixed_prec_flatmm mixed_prec/mixed_prec_flatmm.cpp)
target_compile_options(tile_example_mixed_prec_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
add_executable(tile_example_a16w4_moe_flatmm mixed_prec/a16w4_moe_flatmm.cpp)
target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
include(mxgemm/mx_flatmm_instance.cmake)
mx_flatmm_instance_generate(EXAMPLE_MX_FLATMM_FILES)
message(STATUS "Generated MX FlatMM kernel files: ${EXAMPLE_MX_FLATMM_FILES}")
add_executable(tile_example_mx_flatmm mxgemm/mx_flatmm.cpp ${EXAMPLE_MX_FLATMM_FILES})
target_include_directories(tile_example_mx_flatmm PRIVATE mxgemm)
target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
endif()
endif()

View File

@@ -215,7 +215,7 @@ int run_contiguous_grouped_flatmm_example_with_layouts(
assert(N % N_Warp_Tile == 0 &&
"N must be divisible by N_Warp_Tile for contiguous grouped gemm");
ck_tile::HostTensor<BDataType> b_shuffle_host =
shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));
@@ -431,7 +431,7 @@ int run_masked_grouped_flatmm_example_with_layouts(
assert(N % N_Warp_Tile == 0 &&
"N must be divisible by N_Warp_Tile for contiguous grouped gemm");
ck_tile::HostTensor<BDataType> b_shuffle_host =
shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
ck_tile::shuffle_b<FlatmmConfig, BDataType>(b_k_n_tensor);
std::unique_ptr<ck_tile::DeviceMem> a_m_k_dev_buf(
std::make_unique<ck_tile::DeviceMem>(a_m_k_tensor.get_element_space_size_in_bytes()));

View File

@@ -302,10 +302,6 @@ int run_moe_gemm_example_with_layouts(int argc,
static_cast<float*>(per_token_scale_dev_buf.GetDeviceBuffer()),
static_cast<float*>(per_channel_scale_dev_buf.GetDeviceBuffer()));
const float max_accumulated_value =
*std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
K, 1 /*kbatch*/, max_accumulated_value);
c_m_n_ref_buf->FromDevice(c_m_n_host_ref.data());
const float rtol = std::is_same_v<ADataType, ck_tile::half_t> && IsInputGemm ? 1e-3 : 1e-2;