From 51884fc21412b1800bb85b28c0f5a0b651d23cef Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Sat, 21 Sep 2019 22:53:03 -0500 Subject: [PATCH] WIP: explicitly separate offset component into compile-time, block-invariant and per-thread components --- ...chw_kcyx_nkhw_padded_lds_double_buffer.hpp | 15 ++- .../multi_index_transform.hpp | 44 +++++++ .../tensor_description/tensor_descriptor.hpp | 120 +++++++++++------- .../tensor_descriptor_helper.hpp | 13 ++ .../blockwise_generic_tensor_slice_copy.hpp | 48 ++++--- .../threadwise_generic_tensor_slice_copy.hpp | 115 +++++++++++++++++ composable_kernel/include/utility/array.hpp | 14 ++ .../include/utility/sequence.hpp | 12 +- composable_kernel/include/utility/tuple.hpp | 20 ++- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 2 +- driver/src/driver.cpp | 10 +- 11 files changed, 337 insertions(+), 76 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 0a5b4a3c34..2a934fb9cb 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -3,7 +3,8 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" -#include "ConstantMergedTensorDescriptor.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" #include "ConstantMatrixDescriptor.hpp" #include "blockwise_generic_tensor_slice_copy.hpp" #include "blockwise_gemm.hpp" @@ -172,6 +173,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<1, 0, 1, 0>, + Sequence<1, 1, 1, 1>, + Sequence<0, 0, 0, 0>, decltype(in_e_n1_b_n2_block_desc.GetLengths()), InBlockCopySubLengths_E_N1_B_N2, InBlockCopyClusterLengths_E_N1_B_N2, @@ -213,6 +218,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf BlockwiseGenericTensorSliceCopy_v4, + Sequence<0, 0>, + Sequence<1, 1>, + Sequence<0, 0>, decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, @@ -414,6 +423,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf ThreadwiseGenericTensorSliceCopy_v4r2, + Sequence<0, 0, 0, 0, 0>, + Sequence<1, 1, 1, 0, 1>, + Sequence<0, 0, 0, 1, 0>, decltype( out_k0_k1_n1_b_n2_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 5, 1>::type, diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index bf56678b63..47f2d97089 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -368,5 +368,49 @@ struct Embed } }; +template +struct Vectorize +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + __host__ __device__ constexpr Vectorize() + { + static_assert(VectorSize > 0 && LowerLength % VectorSize == 0, + "wrong! 
cannot evenly divide"); + } + + __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; } + + __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; } + + __host__ __device__ static constexpr auto GetUpperLengths() + { + return Sequence{}; + } + + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) + { + return VectorSize * idx_up; + } + + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) + { + return VectorSize * idx_up_diff; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index 4d0acd7993..f307286f70 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -101,17 +101,24 @@ struct NativeTensorDescriptor return true; } - __host__ __device__ static constexpr auto GetLinearDimensions() + __host__ __device__ static constexpr auto GetMaskOfLinearDimensions() { - return typename arithmetic_sequence_gen<0, nDim, 1>::type{}; + return typename uniform_sequence_gen::type{}; + } + + __host__ __device__ static constexpr auto GetMaskOfNonLinearDimensions() + { + return typename uniform_sequence_gen::type{}; } __host__ __device__ static constexpr auto GetNonLinearDimensions() { return Sequence<>{}; } +#if 0 __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() { return Tuple<>{}; } +#endif // TODO: should this function be here? should it be specific for padding check? __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */) @@ -233,7 +240,7 @@ struct TransformedTensorDescriptor __host__ __device__ static constexpr auto GetUpperLengths() { constexpr auto tuple_of_up_lengths = - transform_tuple(lambda_GetUpperLengths{}, Transforms{}); + transform_tuples(lambda_GetUpperLengths{}, Transforms{}); constexpr auto mingled_up_lengths = unpack(lambda_merge_sequences{}, tuple_of_up_lengths); @@ -346,67 +353,92 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up)); } -#if 1 +#if 0 struct lambda_sequence_logic_or { template __host__ __device__ constexpr auto operator()(Seqs... 
seqs) const { // TODO: should use math::logic_or, after Sequence can take bool - return typename sequence_reduce, Seqs...>::type{}; + return typename sequence_reduce, Seqs...>::type{}; } }; struct lambda_1 { - template - __host__ __device__ constexpr auto operator()(const Transform& tran) const + // check only one transform at a time + template + __host__ __device__ constexpr auto + operator()(const Transform& tran, LowDimensionId, UpDimensionId) const { - return tran.GetUpperLengths(); + // judge if transformation is linear + constexpr bool is_linear_transform = tran.IsLinearTransform(); + + // judge if all lower dimension are linear + constexpr bool is_all_low_dim_linear = math::accumulate_on_sequence( + pick_sequence_elements_by_mask( + GetLowerTensorDescriptor().GetMaskOfLinearDimensions(), LowDimensionId{}), + math::logic_and{}, + integral_constant{}); + + // judge if upper dimenisons are linear + constexpr bool is_up_dim_nonlinear = !(is_linear_transform && is_all_low_dim_linear); + + constexpr auto value_sequence = + typename uniform_sequence_gen::type{}; + + constexpr auto mask_of_up_nonlinear_dims = modifiy_sequence( + typename uniform_sequence_gen::type{}, value_sequence, UpDimensionId{}); + + return mask_of_up_nonlinear_dims; + }; + + __host__ __device__ static constexpr bool GetMaskOfNonLinearDimensions() + { + // create tuple of linear dimension masks, for all transformations + constexpr auto tuple_of_nonlinear_dimension_mask = + transform_tuples(lambda_1{}, Transforms{}, LowDimensionIds{}, UpDimensionIds{}); + + // reduce tuple of masks into one mask + constexpr auto nonlinear_dimension_mask = + unpack(lambda_sequence_logic_or{}, tuple_of_nonlinear_dimension_mask); + + return nonlinear_dimension_mask; } - }; - template - __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() - { - // create tuple of linear dimension masks, for all transformations - constexpr auto tuple_of_linear_dimension_mask = - transform_tuple(lambda_1, Transforms{}); + __host__ __device__ static constexpr bool GetMaskOfLinearDimensions() + { + return GetMaskOfNonLinearDimensions().Transform(math::logic_not{}); + } - // reduce tuple of masks into one mask - constexpr auto linear_dimension_mask = - unpack(lambda_sequence_logic_or{}, tuple_of_linear_dimension_mask); + template + __host__ __device__ static constexpr bool IsLinearDimension(Number) + { + return GetMaskOfLinearDimensions().At(Number{}); + } - return linear_dimension_mask; - } + __host__ __device__ static constexpr auto GetLinearDimensions() + { + constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); - template - __host__ __device__ static constexpr bool IsLinearDimension(Number) - { - return GetMaskOfLinearDimensions().At(Number{}); - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); + } - __host__ __device__ static constexpr auto GetLinearDimensions() - { - constexpr auto linear_dimension_mask = GetMaskOfLienarDimensions(); + __host__ __device__ static constexpr auto GetNonLinearDimensions() + { + constexpr auto nonlinear_dimension_mask = + GetMaskOfLienarDimensions().Transform(math::logic_not{}); - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, linear_dimension_mask); - } + return pick_sequence_elements_by_mask( + typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); + } - __host__ __device__ static constexpr auto GetNonLinearDimensions() - { - constexpr auto 
nonlinear_dimension_mask = - GetMaskOfLienarDimensions().Transform(math::logic_not{}); - - return pick_sequence_elements_by_mask( - typename arithmetic_sequence_gen<0, nDimUp, 1>::type{}, nonlinear_dimension_mask); - } - - __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() - { - // not implemented - } + __host__ __device__ static constexpr auto GetNonLinearIndependentDimensionGroups() + { + // not implemented + } #endif // TODO: should this function be here? should it be specific for padding check? diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 52dc1642e6..12517071ec 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -96,6 +96,19 @@ __host__ __device__ constexpr auto LowerTensorDescriptor{}, typename sequence_map_inverse::type{}); } +template +__host__ __device__ constexpr auto +vectorize_tensor_descriptor(LowerTensorDescriptor, Number vector_dim, Number) +{ + constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); + + return transform_tensor_descriptor( + LowerTensorDescriptor{}, + Vectorize{}, + typename arithmetic_sequence_gen<0, nDim, 1>::type{}, + typename arithmetic_sequence_gen<0, nDim, 1>::type{}); +} + template __host__ __device__ void print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 25cd5a819c..25349dc9f9 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -680,6 +680,10 @@ struct BlockwiseGenericTensorSliceCopy_v3 template (p_src, p_buffer); #endif @@ -750,7 +756,7 @@ struct BlockwiseGenericTensorSliceCopy_v4 { #if 0 mThreadwiseStore.Run(p_buffer, p_dst); -#else +#elif 1 // hardcoded: register to LDS mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); #endif @@ -784,21 +790,31 @@ struct BlockwiseGenericTensorSliceCopy_v4 private: using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{})); - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2; + using ThreadwiseLoad = + ThreadwiseGenericTensorSliceCopy_v4r2::type, + typename uniform_sequence_gen::type, + SubLengths, + SrcDimAccessOrder, + SrcVectorAccessDim, + SrcDataPerAccess, + 1>; - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2; + using ThreadwiseStore = + ThreadwiseGenericTensorSliceCopy_v4r2::type, + typename uniform_sequence_gen::type, + DstLinearDimensionMask, + DstNonLinearDimensionMask, + SubLengths, + DstDimAccessOrder, + DstVectorAccessDim, + 1, + DstDataPerAccess>; ThreadwiseLoad mThreadwiseLoad; ThreadwiseStore mThreadwiseStore; diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 67b0969a0e..d8ecf6508b 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -1136,6 +1136,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1 // the other is device memory or LDS 
template + __device__ static constexpr auto mask_lengths(Sequence, Sequence) + { + return Sequence<(Mask ? Lengths : 1)...>{}; + } + + template + __device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src, + TData* p_dst) const + { + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto vector_access_dim = Number{}; + + constexpr auto src_data_per_access = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto long_vector_size = Number{}; + + constexpr auto long_vector_access_lengths = SliceLengths::Modify( + vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); + + // TODO:: don't use hack + constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; + constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{}; + + // separate steps into linear and non-linear components + constexpr auto linear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, src_linear_dim_mask); + + constexpr auto nonlinear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask); + + // loop over src's non-linear dimensions + ford{}( + [&](auto nonlinear_dim_long_vector_access_id) { + + // step-sizes along src's nonlinear dimensions + auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; + nonlinear_dim_data_steps(vector_access_dim) = + long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; + + // move src cooridnate along nonlinear dimensions + const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps; + + // loop over src's linear dimensions + ford{}( + [&](auto linear_dim_long_vector_access_id) { + + // step-sizes along src's linear dimensions + auto linear_dim_data_steps = linear_dim_long_vector_access_id; + linear_dim_data_steps(vector_access_dim) = + long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // set 0 + for(index_t i = 0; i < long_vector_size; ++i) + { + p_long_vector[i] = 0; + } + + // load data from src to the long-vector buffer + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; + + // move src cooridnate along linear dimensions + const auto src_coord = + src_nonlinear_coord + (linear_dim_data_steps + scalar_id); + + // TODO: good implementation? 
+ const index_t src_linear_offset_diff = + src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); + + // check for padding + // TODO: still kind of messy + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { + const index_t src_offset = src_coord.GetOffset(); + + const index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + } + } + + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; + + const index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = + (mDstSliceOrigin + + (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id)) + .GetOffset(); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + }); + }); + } + // memory-space // 0: VGPR // 1: LDS diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index 7b0c25d41b..b0ffa86785 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -389,6 +389,20 @@ __host__ __device__ constexpr auto operator-(Sequence a, Array +__host__ __device__ constexpr auto operator*(TData v, Array a) +{ + Array result; + + for(index_t i = 0; i < NSize; ++i) + { + result(i) = a[i] * v; + } + + return result; +} + template __host__ __device__ constexpr TData accumulate_on_array(const Array& a, Reduce f, TData init) diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index c351754140..55bda8aede 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -706,18 +706,18 @@ __host__ __device__ constexpr auto sequence_pop_back(Seq) return sequence_pop_front(Seq::Reverse()).Reverse(); } -template -__host__ __device__ constexpr auto transform_sequences(F f, Sequence) -{ - return Sequence{}; -} - template __host__ __device__ constexpr auto merge_sequences(Seqs...) { return typename sequence_merge::type{}; } +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence) +{ + return Sequence{}; +} + template __host__ __device__ constexpr auto transform_sequences(F f, Sequence, Sequence) { diff --git a/composable_kernel/include/utility/tuple.hpp b/composable_kernel/include/utility/tuple.hpp index 27175fe625..815ca8466e 100644 --- a/composable_kernel/include/utility/tuple.hpp +++ b/composable_kernel/include/utility/tuple.hpp @@ -113,19 +113,33 @@ __host__ __device__ constexpr auto make_tuple(Xs&&... 
xs) namespace detail { template -__host__ __device__ constexpr auto transform_tuple_impl(F f, const X& x, Sequence) +__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence) { return make_tuple(f(x.At(Number{}))...); } +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}))...); +} + } // namespace detail template -__host__ __device__ constexpr auto transform_tuple(F f, const X& x) +__host__ __device__ constexpr auto transform_tuples(F f, const X& x) { - return detail::transform_tuple_impl( + return detail::transform_tuples_impl( f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); } +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y) +{ + return detail::transform_tuples_impl( + f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + } // namespace ck #endif diff --git a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index ac95e09d7e..17813f09a4 100644 --- a/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -3,7 +3,7 @@ #include "device.hpp" #include "tensor.hpp" #include "gridwise_convolution_kernel_wrapper.hpp" -#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" +//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp" template
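
A standalone sketch (plain C++17, no composable_kernel headers) of the idea behind the mask_lengths() helper and the nested ford loops added in threadwise_generic_tensor_slice_copy.hpp above: dimensions are split by a linear / non-linear mask, lengths are kept where the mask is set and collapsed to 1 elsewhere, so the expensive (non-linear) part of the offset is computed once per outer step and only the cheap linear part is recomputed per access. All lengths, strides, and mask values below are made up for illustration, and the helper name mirrors the patch but is otherwise hypothetical.

// sketch_linear_nonlinear_split.cpp -- illustrative only; not part of the patch
#include <array>
#include <cstdio>

template <std::size_t N>
constexpr std::array<int, N> mask_lengths(const std::array<int, N>& lengths,
                                          const std::array<int, N>& mask)
{
    std::array<int, N> r{};
    for(std::size_t i = 0; i < N; ++i)
        r[i] = mask[i] ? lengths[i] : 1; // masked-out dimensions collapse to length 1
    return r;
}

int main()
{
    // a 3-d slice: lengths, strides, and a made-up mask marking dims 1 and 2 as "linear"
    constexpr std::array<int, 3> lengths{2, 4, 8};
    constexpr std::array<int, 3> strides{64, 8, 1};
    constexpr std::array<int, 3> linear_mask{0, 1, 1};
    constexpr std::array<int, 3> nonlinear_mask{1, 0, 0};

    constexpr auto linear_lengths    = mask_lengths(lengths, linear_mask);    // {1, 4, 8}
    constexpr auto nonlinear_lengths = mask_lengths(lengths, nonlinear_mask); // {2, 1, 1}

    // outer walk over the non-linear dimensions: offset computed once per step
    for(int n0 = 0; n0 < nonlinear_lengths[0]; ++n0)
    for(int n1 = 0; n1 < nonlinear_lengths[1]; ++n1)
    for(int n2 = 0; n2 < nonlinear_lengths[2]; ++n2)
    {
        const int nonlinear_offset = n0 * strides[0] + n1 * strides[1] + n2 * strides[2];

        // inner walk over the linear dimensions: only a cheap additive diff per access
        for(int l0 = 0; l0 < linear_lengths[0]; ++l0)
        for(int l1 = 0; l1 < linear_lengths[1]; ++l1)
        for(int l2 = 0; l2 < linear_lengths[2]; ++l2)
        {
            const int linear_offset = l0 * strides[0] + l1 * strides[1] + l2 * strides[2];
            std::printf("%d ", nonlinear_offset + linear_offset);
        }
    }
    std::printf("\n");
}

In the actual patch, the outer walk corresponds to advancing src_nonlinear_coord (whose GetOffset() is evaluated once per non-linear step), while the inner walk corresponds to linear_dim_data_steps plus scalar_id, whose contribution to the offset is a simple difference against the non-linear base.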