[CK Tile] enable building examples by default (#3259)

* remove EXCLUDE_FROM_ALL from the ck-tile examples, so they are built as part of the default "all" target
  -> adds roughly 15 min of build time with 64 threads for a single arch

* fix a C++17 compile error in the ck-tile examples (a C++20 templated lambda, replaced with a ck_tile::bool_constant tag argument)

---------

Co-authored-by: khuagarw <khuagarw@amd.com>
Co-authored-by: Ding, Yi <yi.ding@amd.com>
Max Podkorytov
2025-11-26 16:24:44 -08:00
committed by GitHub
parent 40d7217ac7
commit 79aae7c7f7
39 changed files with 175 additions and 174 deletions

View File

@@ -1,20 +1,22 @@
-set(EXAMPLE_CONV_COMPILE_OPTIONS)
-list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx90a")
+    set(EXAMPLE_CONV_COMPILE_OPTIONS)
+    list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
-add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
-target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_fwd grouped_convolution_forward.cpp)
+    target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
 
-add_executable(tile_example_grouped_conv_fwd_large_tensor EXCLUDE_FROM_ALL grouped_convolution_forward_large_tensor.cpp)
-target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_fwd_large_tensor grouped_convolution_forward_large_tensor.cpp)
+    target_compile_options(tile_example_grouped_conv_fwd_large_tensor PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
 
-add_executable(tile_example_grouped_conv_fwd_bias_clamp EXCLUDE_FROM_ALL grouped_convolution_forward_bias_clamp.cpp)
-target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_fwd_bias_clamp grouped_convolution_forward_bias_clamp.cpp)
+    target_compile_options(tile_example_grouped_conv_fwd_bias_clamp PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 
-add_executable(tile_example_grouped_conv_bwd_weight EXCLUDE_FROM_ALL grouped_convolution_backward_weight.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_bwd_weight grouped_convolution_backward_weight.cpp)
+    target_compile_options(tile_example_grouped_conv_bwd_weight PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
 
-add_executable(tile_example_grouped_conv_bwd_weight_two_stage EXCLUDE_FROM_ALL grouped_convolution_backward_weight_two_stage.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_bwd_weight_two_stage grouped_convolution_backward_weight_two_stage.cpp)
+    target_compile_options(tile_example_grouped_conv_bwd_weight_two_stage PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
 
-add_executable(tile_example_grouped_conv_bwd_data EXCLUDE_FROM_ALL grouped_convolution_backward_data.cpp)
-target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+    add_executable(tile_example_grouped_conv_bwd_data grouped_convolution_backward_data.cpp)
+    target_compile_options(tile_example_grouped_conv_bwd_data PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS})
+endif()

View File

@@ -101,7 +101,6 @@ struct GroupedConvolutionForwardInvoker
         const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
         const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
         const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
-        float ave_time{0};
 
         using TransformType =
             ck_tile::TransformConvFwdToGemm<NDimSpatial,
@@ -242,13 +241,15 @@ struct GroupedConvolutionForwardInvoker
         // =====================================================================
         // Kernel launch lambda: Uses EnableSplitImage based on layout support
         // =====================================================================
-        const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
-                                                    const auto tail_number_,
-                                                    const auto memory_operation_) {
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_,
+                             const auto enable_split_image_) {
             constexpr bool has_hot_loop_v = has_hot_loop_.value;
             constexpr auto tail_number_v = tail_number_.value;
             constexpr auto scheduler = ConvConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
+            constexpr bool EnableSplitImage = enable_split_image_.value;
 
             using GroupedConvTraitsType = std::conditional_t<EnableSplitImage,
                                                              GroupedConvTraitsTypeLargeTensor,
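
The deleted form [&]<bool EnableSplitImage>(...) puts an explicit template parameter list on a lambda, which is a C++20 feature and fails to compile under -std=c++17; the hunk above instead threads the flag through as a ck_tile::bool_constant argument whose static .value member remains a constant expression inside the lambda. A minimal standalone sketch of the same pattern, using only the standard library (std::integral_constant standing in for ck_tile::bool_constant; run and Flag are illustrative names, not from the diff):

// C++17 replacement for a C++20 templated lambda, sketched standalone.
#include <type_traits>

template <bool B>
using bool_constant = std::integral_constant<bool, B>; // stand-in for ck_tile::bool_constant

int main()
{
    // C++20 only:
    //   const auto run = [&]<bool Flag>(auto x) { /* ... */ };
    //   run.template operator()<true>(x);
    // C++17: pass the flag as a tag argument instead; .value names a static
    // constexpr member, so it is still usable in constant expressions.
    const auto run = [](auto x, auto flag_) {
        constexpr bool Flag = flag_.value;
        if constexpr(Flag)
            return x * 2; // plays the EnableSplitImage == true branch
        else
            return x;     // plays the EnableSplitImage == false branch
    };

    const int a = run(21, bool_constant<true>{});  // instantiates the Flag == true body
    const int b = run(21, bool_constant<false>{}); // instantiates the Flag == false body
    return a - 2 * b; // 42 - 42 == 0
}

Note the call sites simplify as well: the awkward Run.template operator()<true>(...) spelling is no longer needed, since the flag is now an ordinary argument.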
@@ -357,11 +358,9 @@ struct GroupedConvolutionForwardInvoker
                           << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
             }
 
-            ave_time = ck_tile::launch_kernel(
+            return ck_tile::launch_kernel(
                 s,
                 ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-            return ave_time;
         };
 
         // =====================================================================
@@ -369,28 +368,33 @@ struct GroupedConvolutionForwardInvoker
         if(use_split_image)
         {
             // Use split-image kernel (Kernel<true>)
             const auto RunSplitImage = [&](const auto has_hot_loop_, const auto tail_number_) {
                 if(args.k_batch == 1)
-                    Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                    return Run(
+                        has_hot_loop_, tail_number_, MemoryOpSet{}, ck_tile::bool_constant<true>{});
                 else
-                    Run.template operator()<true>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpAtomicAdd{},
+                               ck_tile::bool_constant<true>{});
             };
-            BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
+            return BaseGemmPipeline::TailHandler(RunSplitImage, has_hot_loop, tail_num);
         }
         else
         {
             // Use regular kernel (Kernel<false>)
             const auto RunRegular = [&](const auto has_hot_loop_, const auto tail_number_) {
                 if(args.k_batch == 1)
-                    Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpSet{},
+                               ck_tile::bool_constant<false>{});
                 else
-                    Run.template operator()<false>(
-                        has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                    return Run(has_hot_loop_,
+                               tail_number_,
+                               MemoryOpAtomicAdd{},
+                               ck_tile::bool_constant<false>{});
             };
-            BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
+            return BaseGemmPipeline::TailHandler(RunRegular, has_hot_loop, tail_num);
         }
-        return ave_time;
     }
 };
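
Returning ck_tile::launch_kernel's result through each dispatch lambda and out of TailHandler is what allows the float ave_time{0}; accumulator to be deleted in the first hunk: every instantiated path must now yield the measured time, so the compiler, rather than a captured local, guarantees a value on every branch. A compressed sketch of that plumbing under the same C++17 constraints, where tail_handler and launch are hypothetical stand-ins for BaseGemmPipeline::TailHandler and ck_tile::launch_kernel, not the real CK Tile APIs:

#include <type_traits>

// Hypothetical stand-ins: launch() plays ck_tile::launch_kernel,
// tail_handler() plays BaseGemmPipeline::TailHandler.
static float launch(bool hot) { return hot ? 1.5f : 0.5f; } // fake kernel time, ms

template <typename F>
static float tail_handler(F f, bool has_hot_loop)
{
    // Turn the runtime bool into a compile-time tag and *return* the
    // lambda's result instead of writing to a captured accumulator.
    return has_hot_loop ? f(std::true_type{}) : f(std::false_type{});
}

static float run_example(bool has_hot_loop, bool split_image)
{
    const auto run = [](auto hot_, auto split_) {
        constexpr bool Hot   = hot_.value;
        constexpr bool Split = split_.value; // would select Kernel<true/false> in the real code
        return launch(Hot) + (Split ? 0.25f : 0.0f);
    };

    // Each branch returns; no path can silently drop the measurement.
    if(split_image)
        return tail_handler([&](auto hot_) { return run(hot_, std::true_type{}); },
                            has_hot_loop);
    return tail_handler([&](auto hot_) { return run(hot_, std::false_type{}); },
                        has_hot_loop);
}

int main() { return run_example(true, false) > 0.0f ? 0 : 1; }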