diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp index 918fd6bdb6..e5a5638887 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp @@ -4,20 +4,20 @@ #pragma once #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck_tile/builder/conv_builder.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_util.hpp" +#include "ck_tile/builder/types.hpp" #include "ck_tile/ops/epilogue.hpp" -#include +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" namespace ck_tile::reflect::conv { diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp index 64996f96f7..1055cbc038 100644 --- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp @@ -8,28 +8,30 @@ #pragma once #include -#include -#include -#include -#include -#include -#include #include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "ck_tile/ops/epilogue.hpp" +#include +#include +#include +#include +#include +#include +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck/utility/sequence.hpp" +#include "ck_tile/core/container/tuple.hpp" +#include "ck_tile/core/numeric/bfloat16.hpp" +#include "ck_tile/core/numeric/float8.hpp" +#include "ck_tile/core/numeric/half.hpp" +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp" #include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp" #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" diff --git a/experimental/builder/test/test_bwd_data_instance_traits.cpp b/experimental/builder/test/test_bwd_data_instance_traits.cpp index 80e8ae8d98..f26b5d7caf 100644 --- a/experimental/builder/test/test_bwd_data_instance_traits.cpp +++ b/experimental/builder/test/test_bwd_data_instance_traits.cpp @@ -2,9 +2,10 @@ // SPDX-License-Identifier: MIT #include -#include -#include -#include +#include "ck/ck.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_data.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_bwd_weight_instance_traits.cpp b/experimental/builder/test/test_bwd_weight_instance_traits.cpp index 9b3cd169bb..c7c4e370e2 100644 --- a/experimental/builder/test/test_bwd_weight_instance_traits.cpp +++ b/experimental/builder/test/test_bwd_weight_instance_traits.cpp @@ -2,10 +2,12 @@ // SPDX-License-Identifier: MIT #include -#include -#include -#include -#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_backward_weight.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp index 6a8f1f14e3..396533cef4 100644 --- a/experimental/builder/test/test_fwd_instance_traits.cpp +++ b/experimental/builder/test/test_fwd_instance_traits.cpp @@ -1,17 +1,19 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck_tile/builder/reflect/instance_traits.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp" +#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp" +#include "ck_tile/ops/epilogue/cshuffle_epilogue.hpp" namespace { diff --git a/experimental/builder/test/test_instance_traits_util.cpp b/experimental/builder/test/test_instance_traits_util.cpp index 42810ace72..852174b805 100644 --- a/experimental/builder/test/test_instance_traits_util.cpp +++ b/experimental/builder/test/test_instance_traits_util.cpp @@ -1,16 +1,16 @@ // Copyright (c) Advanced Micro Devices, Inc., or its affiliates. // SPDX-License-Identifier: MIT -#include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/scheduler_enum.hpp" +#include "ck/utility/sequence.hpp" +#include "ck_tile/builder/reflect/instance_traits_util.hpp" namespace ck_tile::reflect::detail { namespace { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 8d45b8fd74..751608299c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -5,24 +5,16 @@ #if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) #include -#include #endif +#include "ck/utility/pipeline_enum.hpp" +#include "ck/utility/loop_scheduler.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v4_direct_load.hpp" namespace ck { -enum struct PipelineVersion -{ - v1, - v2, - // v3 is only used in the Stream-K implementation. - v4, - weight_only, -}; - template Prefetch stages, number of loop is multiple of unroll stages - Empty, - // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add - // prefetchstages - Full, -}; - enum SchedulerGroup : uint32_t { SCHED_GROUP_MFMA = 0x008, // Matrix FMA instructions diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp index f186d0fea9..b3303e1138 100644 --- a/include/ck/utility/loop_scheduler.hpp +++ b/include/ck/utility/loop_scheduler.hpp @@ -3,40 +3,20 @@ #pragma once -#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) -#include -#endif - #include "ck/utility/common_header.hpp" +#include "ck/utility/scheduler_enum.hpp" namespace ck { -enum struct LoopScheduler -{ - Default, - Interwave, -}; - +/// @brief Helper function to get default loop scheduler +/// @details Returns the default loop scheduler based on compile-time configuration. constexpr LoopScheduler make_default_loop_scheduler() { #if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING return LoopScheduler::Interwave; #else return LoopScheduler::Default; -#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING +#endif } } // namespace ck - -#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) -inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) -{ - switch(s) - { - case ck::LoopScheduler::Default: os << "Default"; break; - case ck::LoopScheduler::Interwave: os << "Interwave"; break; - default: os << ""; - } - return os; -} -#endif diff --git a/include/ck/utility/pipeline_enum.hpp b/include/ck/utility/pipeline_enum.hpp new file mode 100644 index 0000000000..4421386f59 --- /dev/null +++ b/include/ck/utility/pipeline_enum.hpp @@ -0,0 +1,40 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +#include +#endif + +namespace ck { + +/// @brief Pipeline version enumeration for GEMM kernels +/// @details Defines different pipeline strategies for data movement and computation overlap +/// in GEMM kernels. This is a lightweight header containing only the enum definition, +/// extracted from gridwise_gemm_pipeline_selector.hpp to minimize dependencies. +enum struct PipelineVersion +{ + v1, ///< Version 1 pipeline + v2, ///< Version 2 pipeline + // v3 is only used in the Stream-K implementation. + v4, ///< Version 4 pipeline + weight_only, ///< Weight-only specialized pipeline +}; + +} // namespace ck + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p) +{ + switch(p) + { + case ck::PipelineVersion::v1: os << "PipelineVersion::v1"; break; + case ck::PipelineVersion::v2: os << "PipelineVersion::v2"; break; + case ck::PipelineVersion::v4: os << "PipelineVersion::v4"; break; + case ck::PipelineVersion::weight_only: os << "PipelineVersion::weight_only"; break; + default: os << ""; + } + return os; +} +#endif diff --git a/include/ck/utility/scheduler_enum.hpp b/include/ck/utility/scheduler_enum.hpp new file mode 100644 index 0000000000..0c4bfabaf3 --- /dev/null +++ b/include/ck/utility/scheduler_enum.hpp @@ -0,0 +1,83 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +#include +#endif + +namespace ck { + +/// @brief Block GEMM pipeline version enumeration +/// @details Defines different block GEMM pipeline strategies. +/// This is a lightweight header containing only enum definitions, +/// extracted from blkgemmpipe_scheduler.hpp to minimize dependencies. +enum struct BlockGemmPipelineVersion +{ + // For GEMM + v1, ///< Naive pipeline + v2, ///< Memory-optimized pipeline + v3, ///< Compute-optimized pipeline + v4, ///< Compute-optimized with double LDS buffer + v5, ///< Compute-optimized with double global prefetch register buffer + + // For GEMM with preshuffled weight + // v1, single lds buffer + // v2, double lds buffer +}; + +/// @brief Block GEMM pipeline scheduler enumeration +/// @details Defines scheduling strategies for block GEMM pipelines. +enum struct BlockGemmPipelineScheduler +{ + Intrawave, ///< Schedule within a single wavefront + Interwave, ///< Schedule across multiple wavefronts +}; + +/// @brief Loop scheduler enumeration +/// @details Defines scheduling strategies for computational loops. +enum struct LoopScheduler +{ + Default, ///< Default scheduling strategy + Interwave, ///< Cross-wavefront scheduling +}; + +/// @brief Tail number enumeration for pipeline buffering +/// @details Defines the number of tail iterations in pipelined loops. +enum struct TailNumber +{ + // Single / Double buffer pipeline + Odd, ///< Odd number of iterations + Even, ///< Even number of iterations + + // Long prefetch pipeline, up to 8 + One, ///< One tail iteration + Two, ///< Two tail iterations + Three, ///< Three tail iterations + Four, ///< Four tail iterations + Five, ///< Five tail iterations + Six, ///< Six tail iterations + Seven, ///< Seven tail iterations + + // Unroll stages > Prefetch stages, number of loop is multiple of unroll stages + Empty, ///< No tail iterations + // Unroll stages <= Prefetch stages, number of loop is multiple of unroll stages add + // prefetchstages + Full, ///< Full tail iterations +}; + +} // namespace ck + +#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC) +inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s) +{ + switch(s) + { + case ck::LoopScheduler::Default: os << "Default"; break; + case ck::LoopScheduler::Interwave: os << "Interwave"; break; + default: os << ""; + } + return os; +} +#endif