diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ac8550082..306e6ca649 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ message(STATUS "Build with HIP ${hip_VERSION}") message("HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}") # CMAKE_CXX_FLAGS +SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") if(BUILD_DEV) string(APPEND CMAKE_CXX_FLAGS " -Werror -Weverything") endif() diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index fa5d2246d7..a33b9aee8d 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -377,7 +377,7 @@ struct RightPad // at compile-time template ::type = false> + typename enable_if::type = false> struct Embed { static constexpr index_t NDimUp = UpLengths::Size(); diff --git a/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp b/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp index abb48c450b..6d4e01888b 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform_helper.hpp @@ -42,7 +42,7 @@ __host__ __device__ constexpr auto make_right_pad_transform( template ::type = false> + typename enable_if::type = false> __host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_lengths, const Coefficients& coefficients) { diff --git a/composable_kernel/include/tensor_description/tensor_adaptor.hpp b/composable_kernel/include/tensor_description/tensor_adaptor.hpp index 2508abc6b9..f684ce5e0f 100644 --- a/composable_kernel/include/tensor_description/tensor_adaptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_adaptor.hpp @@ -454,9 +454,7 @@ __host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transf remove_cv_t>{transforms}; } -template = 2, bool>::type = false> +template = 2, bool>::type = false> __host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&... xs) { return chain_tensor_adaptors(x, chain_tensor_adaptors(xs...)); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 93f9dac64f..cf329f06a5 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -37,7 +37,7 @@ __host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengt template ::type = false> + typename enable_if::type = false> __host__ __device__ constexpr auto make_naive_tensor_descriptor_v2(const Tuple& lengths, const Tuple& strides) { diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp index 796e6387da..35ff66a2b0 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r2.hpp @@ -22,24 +22,24 @@ namespace ck { // 2. CThreadBuffer is StaticBuffer // Also assume: // M0 = N0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization) -template ::type = false> +template < + index_t BlockSize, + typename FloatA, + typename FloatB, + typename FloatC, + typename AKMBlockDesc, + typename BKNBlockDesc, + index_t M1PerThreadM11, + index_t N1PerThreadN11, + index_t KPerThread, + index_t M1N1ThreadClusterM100, + index_t M1N1ThreadClusterN100, + index_t M1N1ThreadClusterM101, + index_t M1N1ThreadClusterN101, + index_t AThreadCopyScalarPerVector_M11, + index_t BThreadCopyScalarPerVector_N11, + typename enable_if::type = false> struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 { using AIndex = MultiIndex<3>; diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp index ace940d4f3..26ca0bf111 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v2r3.hpp @@ -38,9 +38,9 @@ template index_t AThreadCopyScalarPerVector_BM11, index_t BThreadCopyScalarPerVector_BN11, - typename std::enable_if::type = false> + typename enable_if::type = false> struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2 { using AIndex = MultiIndex<3>; diff --git a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp index ca3aca3015..a925a5cd68 100644 --- a/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_contraction_dlops.hpp @@ -21,10 +21,10 @@ template ::type = false> + typename enable_if::type = false> struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1 { __device__ constexpr ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1() @@ -123,10 +123,10 @@ template ::type = false> + typename enable_if::type = false> struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1 { __device__ constexpr ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1() diff --git a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp index f9d8ac05b6..015ad675fb 100644 --- a/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_gemm_dlops_v3.hpp @@ -19,9 +19,9 @@ template ::type = false> + typename enable_if::type = false> struct ThreadwiseGemmDlops_km_kn_mn_v3 { template ::type = false> + typename enable_if::type = false> struct ThreadwiseTensorSliceSet_v1 { static constexpr index_t nDim = SliceLengths::Size(); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index 82e46984e2..0071accf7f 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -57,7 +57,7 @@ template ::type = false> + typename enable_if::type = false> struct ThreadwiseTensorSliceTransfer_v1r3 { static constexpr index_t nDim = SliceLengths::Size(); @@ -373,7 +373,7 @@ template ::type = false> + typename enable_if::type = false> struct ThreadwiseTensorSliceTransfer_v2 { static constexpr index_t nDim = SliceLengths::Size(); @@ -1261,18 +1261,17 @@ struct ThreadwiseTensorSliceTransfer_v3 // 3. DstOriginIdx is known at compile-time // 4. use direct address calculation // 3. vector access on src -template < - typename SrcData, - typename DstData, - typename SrcDesc, - typename DstDesc, - typename SliceLengths, - typename DimAccessOrder, - index_t SrcVectorDim, - index_t SrcScalarPerVector, - index_t SrcScalarStrideInVector, - typename std::enable_if::type = false> +template ::type = false> struct ThreadwiseTensorSliceTransfer_v4 { static constexpr index_t nDim = SliceLengths::Size(); diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp index 6d96aa1253..f069540343 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp @@ -621,17 +621,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // 3. DstOriginIdx is known at compile-time // 4. use direct address calculation // 3. vector access on src -template < - typename SrcData, - typename DstData, - typename SrcDesc, - typename DstDesc, - typename SliceLengths, - typename DimAccessOrder, - typename SrcVectorTensorLengths, - typename SrcVectorTensorContiguousDimOrder, - typename std::enable_if::type = false> +template ::type = false> struct ThreadwiseTensorSliceTransfer_v4r1 { static constexpr auto I0 = Number<0>{}; diff --git a/composable_kernel/include/utility/c_style_pointer_cast.hpp b/composable_kernel/include/utility/c_style_pointer_cast.hpp index 29cd3d07ca..8acf5790c6 100644 --- a/composable_kernel/include/utility/c_style_pointer_cast.hpp +++ b/composable_kernel/include/utility/c_style_pointer_cast.hpp @@ -2,12 +2,13 @@ #define CK_C_STYLE_POINTER_CAST_HPP #include "type.hpp" +#include "enable_if.hpp" namespace ck { template && is_pointer_v, bool>::type = false> + typename enable_if && is_pointer_v, bool>::type = false> __host__ __device__ PY c_style_pointer_cast(PX p_x) { #pragma clang diagnostic push diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index ba20248028..85c02a1b99 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -14,6 +14,7 @@ #include "functional2.hpp" #include "functional3.hpp" #include "functional4.hpp" +#include "enable_if.hpp" #include "integral_constant.hpp" #include "math.hpp" #include "number.hpp" diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index 6f54f7317c..4d583e3ce7 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -3,6 +3,7 @@ #include "amd_buffer_addressing.hpp" #include "c_style_pointer_cast.hpp" +#include "enable_if.hpp" namespace ck { @@ -38,7 +39,7 @@ struct DynamicBuffer } template >>::type, typename scalar_type>>::type>::value, bool>::type = false> @@ -93,7 +94,7 @@ struct DynamicBuffer } template >>::type, typename scalar_type>>::type>::value, bool>::type = false> diff --git a/composable_kernel/include/utility/enable_if.hpp b/composable_kernel/include/utility/enable_if.hpp new file mode 100644 index 0000000000..501e1bfc1c --- /dev/null +++ b/composable_kernel/include/utility/enable_if.hpp @@ -0,0 +1,13 @@ +#ifndef CK_ENABLE_IF_HPP +#define CK_ENABLE_IF_HPP + +namespace ck { + +template +using enable_if = std::enable_if; + +template +using enable_if_t = typename std::enable_if::type; + +} // namespace ck +#endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index e451059647..bcb25a2941 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -5,6 +5,7 @@ #include "integral_constant.hpp" #include "number.hpp" #include "type.hpp" +#include "enable_if.hpp" namespace ck { namespace math { @@ -184,9 +185,7 @@ __host__ __device__ constexpr auto gcd(Number, Number) return Number{}; } -template = 2, bool>::type = false> +template = 2, bool>::type = false> __host__ __device__ constexpr auto gcd(X x, Ys... ys) { return gcd(x, gcd(ys...)); @@ -199,9 +198,7 @@ __host__ __device__ constexpr auto lcm(X x, Y y) return (x * y) / gcd(x, y); } -template = 2, bool>::type = false> +template = 2, bool>::type = false> __host__ __device__ constexpr auto lcm(X x, Ys... ys) { return lcm(x, lcm(ys...)); diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index 15b73011b4..ee96a8b435 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -4,6 +4,7 @@ #include "integral_constant.hpp" #include "sequence.hpp" #include "type.hpp" +#include "enable_if.hpp" namespace ck { @@ -20,10 +21,9 @@ struct TupleElement { __host__ __device__ constexpr TupleElement() = default; - template < - typename T, - typename std::enable_if>, TupleElement>::value, - bool>::type = false> + template >, TupleElement>::value, + bool>::type = false> __host__ __device__ constexpr TupleElement(T&& v) : mData(std::forward(v)) { } @@ -58,17 +58,16 @@ struct TupleImpl, Xs...> : TupleElement, Xs> { __host__ __device__ constexpr TupleImpl() = default; - template < - typename Y, - typename std::enable_if>, TupleImpl>::value, - bool>::type = false> + template >, TupleImpl>::value, + bool>::type = false> __host__ __device__ constexpr TupleImpl(Y&& y) : TupleElement, Xs>(std::forward(y))... { } - template = 2, bool>::type = false> + template = 2, bool>::type = false> __host__ __device__ constexpr TupleImpl(Ys&&... ys) : TupleElement, Xs>(std::forward(ys))... { @@ -102,16 +101,16 @@ struct Tuple : detail::TupleImpl>, Tuple>::value, - bool>::type = false> + typename enable_if>, Tuple>::value, + bool>::type = false> __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward(y)) { } template = 2, - bool>::type = false> + typename enable_if= 2, bool>::type = + false> __host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward(ys)...) { } diff --git a/composable_kernel/include/utility/type.hpp b/composable_kernel/include/utility/type.hpp index 12ed435658..b7902ad496 100644 --- a/composable_kernel/include/utility/type.hpp +++ b/composable_kernel/include/utility/type.hpp @@ -2,6 +2,7 @@ #define CK_TYPE_HPP #include "integral_constant.hpp" +#include "enable_if.hpp" namespace ck { @@ -39,9 +40,7 @@ struct is_known_at_compile_time> static constexpr bool value = true; }; -template ::type = false> +template ::type = false> __host__ __device__ constexpr Y as_type(X x) { union AsType