diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 9666e3532d..7fe29324a4 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -2,7 +2,6 @@ #define CK_GRIDWISE_CONVOLUTION_IMPLICIT_GEMM_V4R1_NCHW_KCYX_NKHW_PADDED_LDS_DOUBLE_BUFFER_HPP #include "common_header.hpp" -#include "ConstantTensorDescriptor.hpp" #include "tensor_descriptor.hpp" #include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" @@ -73,12 +72,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto True = integral_constant{}; - constexpr auto in_n_c_hi_wi_global_desc = - make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - constexpr auto wei_k_c_y_x_global_desc = - make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); - constexpr auto out_n_k_ho_wo_global_desc = - make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); + constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{}; + constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; + constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{}; constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); @@ -119,11 +115,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr index_t KBlockWork = K / KPerBlock; constexpr index_t BBlockWork = B / BPerBlock; - constexpr auto block_work_desc = - make_ConstantTensorDescriptor_packed(Sequence{}); 
+ constexpr auto block_work_desc = transform_tensor_descriptor( + make_native_tensor_descriptor_packed(Sequence{}), + make_tuple(Merge>{}), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); - const auto block_work_multi_id = - block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); + const auto block_work_multi_id = block_work_desc.CalculateLowerIndex(get_block_1d_id()); const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; @@ -139,7 +137,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( in_n_c_hip_wip_global_desc, - make_tuple(Unmerge>{}, + make_tuple(UnMerge>{}, PassThrough{}, Embed, Sequence>{}, Embed, Sequence>{}), @@ -185,21 +183,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf InBlockCopyDstDataPerWrite_N2>( {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); -#if 0 // weight tensor // tensor descriptor in device memory, src of blockwise copy - constexpr auto wei_e_k_global_desc = - transform_tensor_descriptor(wei_k_c_y_x_global_desc, - make_tuple(Merge>{}, PassThrough{}), - make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); -#else // hack - constexpr auto wei_e_k_global_desc_old = - WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + constexpr auto wei_k_e_global_desc = + unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3); - constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( - wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); -#endif + constexpr auto wei_e_k_global_desc = + reorder_tensor_descriptor_given_upper2lower(wei_k_e_global_desc, Sequence<1, 0>{}); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment @@ -340,10 +330,8 @@ struct 
GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.template RunStoreRegisterBuffer( - p_in_register_buffer, p_in_block_next); - blockwise_wei_copy.template RunStoreRegisterBuffer( - p_wei_register_buffer, p_wei_block_next); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next); } } @@ -368,10 +356,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread); // LDS double buffer: store next data to LDS - blockwise_in_copy.template RunStoreRegisterBuffer( - p_in_register_buffer, p_in_block_double + in_block_space); - blockwise_wei_copy.template RunStoreRegisterBuffer( - p_wei_register_buffer, p_wei_block_double + wei_block_space); + blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, + p_in_block_double + in_block_space); + blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, + p_wei_block_double + wei_block_space); // odd iteration __syncthreads(); @@ -393,12 +381,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf Sequence{}); // output memory layout descriptor in device memory - constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc_old = - OutGlobalDesc::Fold(I1, Number{}).Fold(I0, Number{}, Number{}); - - constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = make_native_tensor_descriptor( - out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetLengths(), - out_n0_n1_n2_k0_k1_ho_wo_global_desc_old.GetStrides()); + constexpr auto out_n0_n1_n2_k0_k1_ho_wo_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(UnMerge>{}, + UnMerge>{}, + PassThrough{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, 
Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}, Sequence<5>{}, Sequence<6>{})); // output merged global tensor descriptor, dst of threadwise copy constexpr auto out_k0_k1_n1_b_n2_global_desc = transform_tensor_descriptor( diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 25169693b2..073cd2cc27 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -384,7 +384,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded_lds_double_buf constexpr auto out_k0_k1_b0_b1_global_desc = transform_tensor_descriptor( out_k_b_global_desc, - make_tuple(Unmerge>{}, Unmerge>{}), + make_tuple(UnMerge>{}, UnMerge>{}), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index 6acbe55233..cd70d05f2a 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -252,7 +252,6 @@ struct Merge }); // highest dimension, no out-of-bound check - if(borrow) { --idx_low_new(0); @@ -273,7 +272,7 @@ struct Merge // UpperLengths: Sequence<...> template -struct Unmerge +struct UnMerge { static constexpr index_t nDimLow = 1; static constexpr index_t nDimUp = UpperLengths::Size(); diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 
5b4805e9ee..223d0d5bed 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -325,14 +325,14 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index d6cc0cd46d..cbb9a703df 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -81,7 +81,7 @@ struct NativeTensorCoordinate __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; } private: - // mIndex may be saved and update, however, the value of some (or all) of its entries may + // mIndex may be saved and updated, however, the value of some (or all) of its entries may // never be used. Compiler should be able to remove these entries as well as its calculation // as dead code. // TODO: make sure compiler indeed remove these dead code @@ -178,7 +178,8 @@ struct TransformedTensorCoordinate } private: - // mIndexUp may be calculated and update, however, the value of some (or all) of its entries may + // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries + // may // never be used. Compiler should be able to remove these entries as well as its calculation // as dead code. 
// TODO: make sure compiler indeed remove these dead code @@ -192,7 +193,7 @@ struct TensorCoordinate_v2 private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -200,7 +201,7 @@ struct TensorCoordinate_v2 template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index 175c44ae7c..21deafe0f6 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -299,7 +299,7 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().GetElementSpace(); } - // TODO: right now return value is constexpr because use of non-constepxr lambda + // TODO: right now return value is not constexpr because use of non-constexpr lambda __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) { LowerIndex idx_low; diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 52dc1642e6..d4d80a2138 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -96,6 +96,81 @@ __host__ __device__ constexpr auto LowerTensorDescriptor{}, typename sequence_map_inverse::type{}); } +template +__host__ __device__ constexpr bool AreDimensionsUnfoldable(Lengths, Strides) +{ + static_assert(Lengths::Size() == Strides::Size(), "wrong!"); + + bool flag = true; + + for(index_t i = 0; 
i < Lengths::Size() - 1; ++i) + { + flag = flag && Strides::At(i) == Strides::At(i + 1) * Lengths::At(i + 1); + } + + return flag; +} + +// unfold only supports NativeTensorDescriptor, for now +template +__host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescriptor desc, + Number, + Number) +{ + constexpr index_t nDim = desc.GetNumOfDimension(); + + static_assert(FirstUnfoldDim >= 0 && LastUnfoldDim < nDim && FirstUnfoldDim <= LastUnfoldDim, + "wrong! should have FirstUnfoldDim <= LastUnfoldDim!"); + + // left and right + constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::type{}; + constexpr auto middle = + typename arithmetic_sequence_gen::type{}; + constexpr auto right = typename arithmetic_sequence_gen::type{}; + + // sanity-check if unfoldable + static_assert(AreDimensionsUnfoldable(desc.GetLengths(middle), desc.GetStrides(middle)), + "wrong! not unfoldable"); + + // unfolded length, stride + constexpr index_t unfold_length = + reduce_on_sequence(desc.GetLengths(middle), math::multiplies{}, Number<1>{}); + + constexpr index_t unfold_stride = desc.GetStride(Number{}); + + // new lengths, strides + constexpr auto new_lengths = + desc.GetLengths(left).PushBack(Number{}).PushBack(desc.GetLengths(right)); + + constexpr auto new_strides = + desc.GetStrides(left).PushBack(Number{}).PushBack(desc.GetStrides(right)); + + return make_native_tensor_descriptor(new_lengths, new_strides); +} + +#if 0 +template +__host__ __device__ constexpr auto + pad_tensor_descriptor(LowerTensorDescriptor, PadLowerDimensionIds, LeftPads, RightPads) +{ + constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); + + constexpr auto non_pad_low_dim_ids = xxx; + + return transform_tensor_descriptor( + LowerTensorDescriptor{}, + make_tuple(Pad{}) + .PushBack(PassThrough...), + make_tuple(PadLowerDimensionIds{}).PushBack(xxxx), + sequence_to_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{})); +} +#endif + template +__host__ 
__device__ void print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index d9c8060d04..2c9e5e2045 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -738,12 +738,12 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { #if 1 - mThreadwiseLoad.template Run_generic( + mThreadwiseLoad.template Run_generic( p_src, p_buffer); #else mThreadwiseLoad.template Run_optimized_src_address_calculation( + address_space_t::generic>( p_src, p_buffer); #endif } @@ -752,11 +752,11 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const { #if 1 - mThreadwiseStore.template Run_generic( + mThreadwiseStore.template Run_generic( p_buffer, p_dst); #else mThreadwiseStore.template Run_optimized_dst_address_calculation(p_buffer, p_dst); #endif diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index dffd6fd08b..7f9dc137d4 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -37,11 +37,16 @@ typedef float float4_t __attribute__((ext_vector_type(4))); typedef int32_t int32x4_t __attribute__((ext_vector_type(4))); +// data type conversion template -__device__ void fused_multiply_accumulate(T& d, const T& s0, const T& s1) +struct type_convert { - d += s0 * s1; -} + template + __device__ T operator()(X x) const + { + return static_cast(x); + } +}; } // namespace ck diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp 
b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index 679009680c..0a0f988b1f 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -33,18 +33,12 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(InDesc, constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; -#if 1 - constexpr auto in_nchw_desc = InDesc{}; - constexpr auto wei_kcyx_desc = WeiDesc{}; - constexpr auto out_nkhw_desc = OutDesc{}; -#else constexpr auto in_nchw_desc = make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides()); constexpr auto wei_kcyx_desc = make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides()); constexpr auto out_nkhw_desc = - make_native_tensor_descriptor(OutDesc::GetLegnths(), OutDesc::GetStrides()); -#endif + make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides()); constexpr index_t N = out_nkhw_desc.GetLength(I0); constexpr index_t K = out_nkhw_desc.GetLength(I1); diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index a5e03506b5..98139081ca 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 1 +#elif 0 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 0 +#elif 1 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128;