From a157e33311493c2db0cbf3164d04587ae815d5d3 Mon Sep 17 00:00:00 2001 From: John Shumway Date: Fri, 5 Dec 2025 07:44:10 -0800 Subject: [PATCH] Simplify includes for CK builder reflection (#3357) We only want to import enums and types into the builder reflection code. But, some of the enums are included in much larger files or even big trees of include files. This leads to unintended mixing of code and very confusing interactions and symbol conflicts. We organize the includes and extract two new enum-only headers to help with decoupling in CK. This refactoring is critical if we want to include reflection in a device-operator "describe" method. * Remove a few unnecessary includes from headers in builder/reflect/. * Extract enums scheduler and pipeline to their own headers so they can be used without importing other code. * Order includes alphabetically for better organization. The immediate goal is to unblock reflection integration, and this type of cleanup helps the flexibility and robustness of the CK header library. [ROCm/composable_kernel commit: f5b0af22722b130f03cac590ca9b8729b1b84991] --- .../ck_tile/builder/reflect/conv_traits.hpp | 26 +++--- .../builder/reflect/instance_traits_util.hpp | 42 +++++----- .../test/test_bwd_data_instance_traits.cpp | 7 +- .../test/test_bwd_weight_instance_traits.cpp | 10 ++- .../builder/test/test_fwd_instance_traits.cpp | 22 ++--- .../test/test_instance_traits_util.cpp | 18 ++-- .../grid/gridwise_gemm_pipeline_selector.hpp | 27 +----- include/ck/utility/blkgemmpipe_scheduler.hpp | 44 +--------- include/ck/utility/loop_scheduler.hpp | 28 +------ include/ck/utility/pipeline_enum.hpp | 40 +++++++++ include/ck/utility/scheduler_enum.hpp | 83 +++++++++++++++++++ 11 files changed, 197 insertions(+), 150 deletions(-) create mode 100644 include/ck/utility/pipeline_enum.hpp create mode 100644 include/ck/utility/scheduler_enum.hpp diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp index 918fd6bdb6..e5a5638887 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp @@ -4,20 +4,20 @@ #pragma once #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck_tile/builder/conv_builder.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_util.hpp" +#include "ck_tile/builder/types.hpp" #include "ck_tile/ops/epilogue.hpp" -#include +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" namespace ck_tile::reflect::conv { diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 64996f96f7..1055cbc038 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -8,28 +8,30 @@ #pragma once #include -#include -#include -#include -#include -#include -#include #include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ck_tile/ops/epilogue.hpp" +#include +#include +#include +#include +#include +#include +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck/utility/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/numeric/bfloat16.hpp" +#include "ck_tile/core/numeric/float8.hpp" +#include "ck_tile/core/numeric/half.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" #include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp" #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" diff --git a/experimental/builder/test/test_bwd_data_instance_traits.cpp b/experimental/builder/test/test_bwd_data_instance_traits.cpp index 80e8ae8d98..f26b5d7caf 100644 --- a/experimental/builder/test/test_bwd_data_instance_traits.cpp +++ b/experimental/builder/test/test_bwd_data_instance_traits.cpp @@ -2,9 +2,10 @@ // SPDX-License-Identifier: MIT #include -#include -#include -#include +#include "ck/ck.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_data.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_bwd_weight_instance_traits.cpp b/experimental/builder/test/test_bwd_weight_instance_traits.cpp index 9b3cd169bb..c7c4e370e2 100644 --- a/experimental/builder/test/test_bwd_weight_instance_traits.cpp +++ b/experimental/builder/test/test_bwd_weight_instance_traits.cpp @@ -2,10 +2,12 @@ // SPDX-License-Identifier: MIT #include -#include -#include -#include -#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp index 6a8f1f14e3..396533cef4 100644 --- a/experimental/builder/test/test_fwd_instance_traits.cpp +++ b/experimental/builder/test/test_fwd_instance_traits.cpp @@ -1,17 +1,19 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_instance_traits_util.cpp b/experimental/builder/test/test_instance_traits_util.cpp index 42810ace72..852174b805 100644 --- a/experimental/builder/test/test_instance_traits_util.cpp +++ b/experimental/builder/test/test_instance_traits_util.cpp @@ -1,16 +1,16 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck/utility/sequence.hpp" +#include "ck_tile/builder/reflect/instance_traits_util.hpp" namespace ck_tile::reflect::detail { namespace { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 8d45b8fd74..751608299c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -5,24 +5,16 @@ #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) #include -#include #endif +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/loop_scheduler.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v4_direct_load.hpp" namespace ck { -enum struct PipelineVersion -{ - v1, - v2, - // v3 is only used in the Stream-K implementation. - v4, - weight_only, -}; - template Prefetch stages, number of loop is multiple of unroll stages - Empty, - // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add - // prefetchstages - Full, -}; - enum SchedulerGroup : uint32_t { SCHED_GROUP_MFMA = 0x008, // Matrix FMA instructions diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp index f186d0fea9..b3303e1138 100644 --- a/include/ck/utility/loop_scheduler.hpp +++ b/include/ck/utility/loop_scheduler.hpp @@ -3,40 +3,20 @@ #pragma once -#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) -#include -#endif - #include "ck/utility/common_header.hpp" +#include "ck/utility/scheduler_enum.hpp" namespace ck { -enum struct LoopScheduler -{ - Default, - Interwave, -}; - +/// @brief Helper function to get default loop scheduler +/// @details Returns the default loop scheduler based on compile-time configuration. constexpr LoopScheduler make_default_loop_scheduler() { #if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING return LoopScheduler::Interwave; #else return LoopScheduler::Default; -#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING +#endif } } // namespace ck - -#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) -inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) -{ - switch(s) - { - case ck::LoopScheduler::Default: os << "Default"; break; - case ck::LoopScheduler::Interwave: os << "Interwave"; break; - default: os << ""; - } - return os; -} -#endif diff --git a/include/ck/utility/pipeline_enum.hpp b/include/ck/utility/pipeline_enum.hpp new file mode 100644 index 0000000000..4421386f59 --- /dev/null +++ b/include/ck/utility/pipeline_enum.hpp @@ -0,0 +1,40 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +#include +#endif + +namespace ck { + +/// @brief Pipeline version enumeration for GEMM kernels +/// @details Defines different pipeline strategies for data movement and computation overlap +/// in GEMM kernels. This is a lightweight header containing only the enum definition, +/// extracted from gridwise_gemm_pipeline_selector.hpp to minimize dependencies. +enum struct PipelineVersion +{ + v1, ///< Version 1 pipeline + v2, ///< Version 2 pipeline + // v3 is only used in the Stream-K implementation. + v4, ///< Version 4 pipeline + weight_only, ///< Weight-only specialized pipeline +}; + +} // namespace ck + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) +{ + switch(p) + { + case ck::PipelineVersion::v1: os << "PipelineVersion::v1"; break; + case ck::PipelineVersion::v2: os << "PipelineVersion::v2"; break; + case ck::PipelineVersion::v4: os << "PipelineVersion::v4"; break; + case ck::PipelineVersion::weight_only: os << "PipelineVersion::weight_only"; break; + default: os << ""; + } + return os; +} +#endif diff --git a/include/ck/utility/scheduler_enum.hpp b/include/ck/utility/scheduler_enum.hpp new file mode 100644 index 0000000000..0c4bfabaf3 --- /dev/null +++ b/include/ck/utility/scheduler_enum.hpp @@ -0,0 +1,83 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +#include +#endif + +namespace ck { + +/// @brief Block GEMM pipeline version enumeration +/// @details Defines different block GEMM pipeline strategies. +/// This is a lightweight header containing only enum definitions, +/// extracted from blkgemmpipe_scheduler.hpp to minimize dependencies. +enum struct BlockGemmPipelineVersion +{ + // For GEMM + v1, ///< Naive pipeline + v2, ///< Memory-optimized pipeline + v3, ///< Compute-optimized pipeline + v4, ///< Compute-optimized with double LDS buffer + v5, ///< Compute-optimized with double global prefetch register buffer + + // For GEMM with preshuffled weight + // v1, single lds buffer + // v2, double lds buffer +}; + +/// @brief Block GEMM pipeline scheduler enumeration +/// @details Defines scheduling strategies for block GEMM pipelines. +enum struct BlockGemmPipelineScheduler +{ + Intrawave, ///< Schedule within a single wavefront + Interwave, ///< Schedule across multiple wavefronts +}; + +/// @brief Loop scheduler enumeration +/// @details Defines scheduling strategies for computational loops. +enum struct LoopScheduler +{ + Default, ///< Default scheduling strategy + Interwave, ///< Cross-wavefront scheduling +}; + +/// @brief Tail number enumeration for pipeline buffering +/// @details Defines the number of tail iterations in pipelined loops. +enum struct TailNumber +{ + // Single / Double buffer pipeline + Odd, ///< Odd number of iterations + Even, ///< Even number of iterations + + // Long prefetch pipeline, up to 8 + One, ///< One tail iteration + Two, ///< Two tail iterations + Three, ///< Three tail iterations + Four, ///< Four tail iterations + Five, ///< Five tail iterations + Six, ///< Six tail iterations + Seven, ///< Seven tail iterations + + // Unroll stages > Prefetch stages, number of loop is multiple of unroll stages + Empty, ///< No tail iterations + // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add + // prefetchstages + Full, ///< Full tail iterations +}; + +} // namespace ck + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) +{ + switch(s) + { + case ck::LoopScheduler::Default: os << "Default"; break; + case ck::LoopScheduler::Interwave: os << "Interwave"; break; + default: os << ""; + } + return os; +} +#endif