From 724e984bfffdbe45b98d31c349d24998ed58b541 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 11 Sep 2019 01:13:13 -0500 Subject: [PATCH] enabling padding for chwn format --- ...plicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp | 371 ++++++++---------- .../multi_index_transform.hpp | 65 ++- .../tensor_coordinate_v2.hpp | 140 +++++-- .../tensor_description/tensor_descriptor.hpp | 78 +++- .../blockwise_generic_tensor_slice_copy.hpp | 161 +++++++- .../threadwise_generic_tensor_slice_copy.hpp | 167 +++++++- composable_kernel/include/utility/array.hpp | 17 +- .../include/utility/array_helper.hpp | 88 ++++- .../include/utility/config_nvidia.hpp.in | 11 +- .../include/utility/sequence.hpp | 6 +- ...implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp | 2 +- driver/src/driver.cpp | 44 +-- 12 files changed, 817 insertions(+), 333 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp index 2c5e1e087b..cfa8e9f02d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_padded.hpp @@ -18,8 +18,8 @@ template {}; static constexpr auto I11 = Number<11>{}; -#if 0 + static constexpr auto True = integral_constant{}; + static constexpr auto False = integral_constant{}; + +#if 1 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -73,14 +76,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded GemmNPerThreadSubC % NPerThread == 0)), "wrong!"); - constexpr auto True = integral_constant{}; - constexpr auto False = integral_constant{}; + constexpr auto in_c_h_w_n_global_desc_old = InGlobalDesc{}; + constexpr auto wei_c_y_x_k_global_desc_old = WeiGlobalDesc{}; + constexpr auto out_k_h_w_n_global_desc_old = OutGlobalDesc{}; - constexpr auto in_c_h_w_n_global_desc = InGlobalDesc{}; - constexpr auto wei_c_y_x_k_global_desc = WeiGlobalDesc{}; - constexpr auto out_k_h_w_n_global_desc = OutGlobalDesc{}; + constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor( + in_c_h_w_n_global_desc_old.GetLengths(), in_c_h_w_n_global_desc_old.GetStrides()); - constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr auto wei_c_y_x_k_global_desc = make_native_tensor_descriptor( + wei_c_y_x_k_global_desc_old.GetLengths(), wei_c_y_x_k_global_desc_old.GetStrides()); + + constexpr auto out_k_h_w_n_global_desc = make_native_tensor_descriptor( + out_k_h_w_n_global_desc_old.GetLengths(), out_k_h_w_n_global_desc_old.GetStrides()); + + constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); + constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); constexpr index_t K = out_k_h_w_n_global_desc.GetLength(I0); constexpr index_t Ho = out_k_h_w_n_global_desc.GetLength(I1); @@ -111,11 +122,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded const index_t wo_block_data_begin = block_work_multi_id[2] * WoPerBlock; const index_t n_block_data_begin = block_work_multi_id[3] * NPerBlock; - const index_t hi_block_data_begin = ho_block_data_begin; - const index_t wi_block_data_begin = wo_block_data_begin; + const index_t hi_block_data_begin = ho_block_data_begin - 
LeftPads{}[0]; + const index_t wi_block_data_begin = wo_block_data_begin - LeftPads{}[1]; + + // input global tensor view + constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( + in_c_h_w_n_global_desc, + make_tuple( + PassThrough{}, Pad, LeftPads, RightPads>{}, PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); // global tensor view - constexpr auto wei_c_k_global_desc = wei_c_y_x_k_global_desc.Extract(I0, I3); + constexpr auto wei_c_k_global_desc_old = wei_c_y_x_k_global_desc_old.Extract(I0, I3); + + constexpr auto wei_c_k_global_desc = make_native_tensor_descriptor( + wei_c_k_global_desc_old.GetLengths(), wei_c_k_global_desc_old.GetStrides()); // LDS tensor view // be careful of alignment @@ -124,122 +146,81 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc_old = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); + // hack + constexpr auto in_c_h_w_n_block_desc = make_native_tensor_descriptor( + in_c_h_w_n_block_desc_old.GetLengths(), in_c_h_w_n_block_desc_old.GetStrides()); + // this check is ad-hoc // TODO: need to properly implement tensor descriptor with alignment static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc_old = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_c_1_1_k_block_desc = make_ConstantTensorDescriptor_aligned( - Sequence{}, Number{}); + constexpr auto wei_c_k_block_desc = make_native_tensor_descriptor( + wei_c_k_block_desc_old.GetLengths(), wei_c_k_block_desc_old.GetStrides()); // LDS: be careful of alignment - constexpr index_t in_block_space = in_c_h_w_n_block_desc.GetElementSpace(); - constexpr index_t wei_block_space = wei_c_k_block_desc.GetElementSpace(); + constexpr index_t in_block_space = in_c_h_w_n_block_desc_old.GetElementSpace(); + constexpr index_t wei_block_space = wei_c_k_block_desc_old.GetElementSpace(); __shared__ Float p_in_block[in_block_space]; __shared__ Float p_wei_block[wei_block_space]; // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( + constexpr auto out_k_h_w_n_thread_desc_old = make_ConstantTensorDescriptor_packed( Sequence{}); -#if 1 + constexpr auto out_k_h_w_n_thread_desc = make_native_tensor_descriptor( + out_k_h_w_n_thread_desc_old.GetLengths(), out_k_h_w_n_thread_desc_old.GetStrides()); + // blockwise input copy // format is [C, Hi, Wi, N] auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, - {0, 0, 0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v2 #else - auto in_c_h_w_n_global = make_TensorView(in_c_h_w_n_global_desc, p_in_global); - auto in_c_h_w_n_block = make_TensorView(in_c_h_w_n_block_desc, p_in_block); - - auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v3, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - InBlockCopyDataPerAccess_N, - InBlockCopyDataPerAccess_N>( - in_c_h_w_n_global, - {0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin}, - in_c_h_w_n_block, - 
{0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v4 #endif + , + Sequence<0, 1, 2, 3>, + Sequence<0, 1, 2, 3>, + 3, + 3, + InBlockCopyDataPerAccess_N, + InBlockCopyDataPerAccess_N>({0, 0, 0, 0}, {0, 0, 0, 0}); -#if 1 // blockwise wei copy // format is [CPerBlock, KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2, - Sequence<0, 1>, - Sequence<0, 1>, - 1, - 1, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); +#if 0 + BlockwiseGenericTensorSliceCopy_v2 #else - auto wei_c_y_x_k_global = make_TensorView(wei_c_y_x_k_global_desc, p_wei_global); - auto wei_c_1_1_k_block = make_TensorView(wei_c_1_1_k_block_desc, p_wei_block); - - constexpr index_t WeiBlockCopySubLengths_C = WeiBlockCopySubLengths_CK{}[0]; - constexpr index_t WeiBlockCopySubLengths_K = WeiBlockCopySubLengths_CK{}[1]; - - using WeiBlockCopySubLengths_CYXK = - Sequence; - - constexpr index_t WeiBlockCopyClusterLengths_C = WeiBlockCopyClusterLengths_CK{}[0]; - constexpr index_t WeiBlockCopyClusterLengths_K = WeiBlockCopyClusterLengths_CK{}[1]; - - using WeiBlockCopyClusterLengths_CYXK = - Sequence; - - auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v3, - Sequence<0, 1, 2, 3>, - Sequence<0, 1, 2, 3>, - 3, - 3, - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( - wei_c_y_x_k_global, {0, 0, 0, k_block_data_begin}, wei_c_1_1_k_block, {0, 0, 0, 0}); + BlockwiseGenericTensorSliceCopy_v4 #endif + , + Sequence<0, 1>, + Sequence<0, 1>, + 1, + 1, + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>({0, 0}, {0, 0}); // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -283,7 +264,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded // register // C++ lambda doesn't capture array, use pointer instead - Float p_out_thread_data[out_k_h_w_n_thread_desc.GetElementSpace()]; + Float p_out_thread_data[out_k_h_w_n_thread_desc_old.GetElementSpace()]; Float* const p_out_thread = p_out_thread_data; // set threadwise output tensor to 0 @@ -296,12 +277,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded { const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( - 0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin); + in_c_h_w_n_global_desc.CalculateOffset( + {0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin}); const Float* p_wei_global_block_offset = p_wei_global + - wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); + wei_c_y_x_k_global_desc.CalculateOffset({0, y, x, k_block_data_begin}); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, @@ -390,25 +371,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc) - .Fold(I3, Number{}, Number{}) - .Fold(I2, Number{}, Number{}) - .Fold(I0, Number{}, Number{}); + constexpr auto out_10d_global_desc_old = fwd(out_k_h_w_n_global_desc_old) + .Fold(I3, Number{}, Number{}) + .Fold(I2, Number{}, Number{}) + .Fold(I0, Number{}, Number{}); - constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc) - .Fold(I3, Number<1>{}, Number{}) - .Fold(I2, Number{}, Number<1>{}) - .Fold(I0, Number<1>{}, Number{}); + constexpr auto out_10d_global_desc = make_native_tensor_descriptor( + out_10d_global_desc_old.GetLengths(), 
out_10d_global_desc_old.GetStrides()); - Float* p_out_thread_on_global = p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin); + constexpr auto out_10d_thread_desc_old = fwd(out_k_h_w_n_thread_desc_old) + .Fold(I3, Number<1>{}, Number{}) + .Fold(I2, Number{}, Number<1>{}) + .Fold(I0, Number<1>{}, Number{}); -#if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, @@ -417,19 +403,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); -#elif 0 - ThreadwiseGenericTensorSliceCopy_v1r1::type, - arithmetic_sequence_gen<0, 10, 1>::type, - 9, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#endif }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -445,27 +418,32 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = - fwd(out_k_h_w_n_global_desc) + constexpr auto out_10d_global_desc_old = + fwd(out_k_h_w_n_global_desc_old) .Fold(I3, Number{}) .Fold(I2, Number{}, Number{}, Number{}) .Fold(I0, Number{}, Number{}); - constexpr auto out_10d_thread_desc = - fwd(out_k_h_w_n_thread_desc) + constexpr auto out_10d_global_desc = make_native_tensor_descriptor( + out_10d_global_desc_old.GetLengths(), out_10d_global_desc_old.GetStrides()); + + constexpr auto out_10d_thread_desc_old = + fwd(out_k_h_w_n_thread_desc_old) .Fold(I3, Number{}) .Fold(I2, Number{}, Number<1>{}, Number{}) .Fold(I0, Number<1>{}, Number{}); - Float* p_out_thread_on_global = p_out_global + - out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin); + constexpr auto out_10d_thread_desc = make_native_tensor_descriptor( + out_10d_thread_desc_old.GetLengths(0), out_10d_thread_desc_old.GetStrides()); -#if 1 - ThreadwiseGenericTensorSliceCopy_v1r2::type, @@ -474,58 +452,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); -#elif 0 - ThreadwiseGenericTensorSliceCopy_v1r1::type, - arithmetic_sequence_gen<0, 10, 1>::type, - 9, - 9, - OutThreadCopyDataPerAccess_N, - OutThreadCopyDataPerAccess_N>( - make_zero_array(), make_zero_array()) - .Run(p_out_thread, p_out_thread_on_global); -#endif }); } -#else +#elif 0 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const { -#if 0 - constexpr auto a = make_tuple(true, Sequence<1>{}, index_t(99)); - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", a.At(I0)); - print_Sequence("[1]", a.At(I1)); - printf("[2] %lu\n", a.At(I2)); - } - - bool flag = true; - - auto b = make_tuple(flag, Sequence<1>{}, 99); - - b.At(I0) = false; - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", b.At(I0)); 
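The padded kernel above shifts each block's input window by the left padding (hi_block_data_begin = ho_block_data_begin - LeftPads{}[0]), so border blocks address negative input coordinates that must be predicated away. A minimal host-side sketch of that access pattern; the sizes and the padded_read helper are illustrative stand-ins for the template parameters, not part of the patch:

#include <cstdio>

// Illustrative only: hypothetical sizes standing in for the template parameters.
constexpr int Hi = 4, Wi = 4;             // unpadded input extent
constexpr int LeftPadH = 1, LeftPadW = 1; // left padding, as in LeftPads{}

// Read in[h][w] at padded coordinates; a read that falls in the padding area
// contributes zero, which is what the in-pad predicate in the copy enforces.
float padded_read(const float* in, int hp, int wp)
{
    const int h = hp - LeftPadH; // same shift as hi_block_data_begin above
    const int w = wp - LeftPadW;
    const bool in_pad = h < 0 || h >= Hi || w < 0 || w >= Wi;
    return in_pad ? 0.0f : in[h * Wi + w];
}

int main()
{
    float in[Hi * Wi];
    for(int i = 0; i < Hi * Wi; ++i)
        in[i] = static_cast<float>(i);
    std::printf("%.1f %.1f\n", padded_read(in, 0, 0), padded_read(in, 2, 2)); // 0.0 5.0
}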
- print_Sequence("[1]", b.At(I1)); - printf("[2] %lu\n", b.At(I2)); - - printf("flag %d\n", flag); - } - - if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) - { - printf("[0] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I0)); - print_Sequence("[1]", make_tuple(true, Sequence<1>(), index_t(99)).At(I1)); - printf("[2] %d\n", make_tuple(true, Sequence<1>(), index_t(99)).At(I2)); - } -#elif 1 // create a native tensor descriptor constexpr auto in_c_h_w_n_global_desc = make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); @@ -540,11 +473,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded constexpr auto in_n_c_hp_wp_global_desc = transform_tensor_descriptor( in_c_h_w_n_global_desc, make_tuple( - Pad, LowerPads, UpperPads>{}, PassThrough{}, PassThrough{}), + Pad, LeftPads, RightPads>{}, PassThrough{}, PassThrough{}), make_tuple(Sequence<1, 2>{}, Sequence<0>{}, Sequence<3>{}), make_tuple(Sequence<2, 3>{}, Sequence<1>{}, Sequence<0>{})); -#if 1 // transformation: {n, c, hp, wp} --> {c, b} // {n, hp, wp} --> {b}, {c} --> {c} constexpr auto in_c_b_global_desc = transform_tensor_descriptor( @@ -553,9 +485,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded PassThrough{}), make_tuple(Sequence<0, 2, 3>{}, Sequence<1>{}), make_tuple(Sequence<1>{}, Sequence<0>{})); -#endif -#if 1 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) { // 0 @@ -577,16 +507,55 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_padded printf("in_c_b_global_desc offset: %lu\n", in_c_b_global_desc.CalculateOffset(idx2)); } + } #else - { - index_t c = static_cast(threadIdx.x); - index_t h = static_cast(threadIdx.y); - index_t w = static_cast(threadIdx.z); + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + // create a native tensor descriptor + constexpr auto in_c_h_w_n_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); - p_out_global[0] = in_n_c_h_w_padded_global_desc.CalculateOffset({1, c, h, w}); + constexpr index_t C = in_c_h_w_n_global_desc.GetLength(I0); + constexpr index_t Hi = in_c_h_w_n_global_desc.GetLength(I1); + constexpr index_t Wi = in_c_h_w_n_global_desc.GetLength(I2); + constexpr index_t N = in_c_h_w_n_global_desc.GetLength(I3); + + // transformation: {c, h, w, n} --> {n, c, hp, wp} + // {h, w} --> {hp, wp}, {c} --> {c}, {n} --> {n} + constexpr auto in_c_hp_wp_n_global_desc = transform_tensor_descriptor( + in_c_h_w_n_global_desc, + make_tuple( + PassThrough{}, Pad, LeftPads, RightPads>{}, PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + // 0 + print_tensor_descriptor("in_c_h_w_n_global_desc", in_c_h_w_n_global_desc); + + // 1 + print_tensor_descriptor("in_c_hp_wp_n_global_desc", in_c_hp_wp_n_global_desc); + + constexpr auto idx1 = MultiIndex<4>{1, 2, 3, 4}; + auto idx0 = in_c_hp_wp_n_global_desc.CalculateLowerIndex(idx1); + + print_array("idx1: ", idx1); + print_array("idx0: ", idx0); + + auto coord1 = make_tensor_coordinate_v2(in_c_hp_wp_n_global_desc, idx1); + + print_array("1: ", coord1.GetIndex()); + print_array("0: ", coord1.GetLowerCoordinate().GetIndex()); + + printf("in_c_hp_wp_n_global_desc is_in_pad: %d\n", + coord1.IsAnyLevelIndexInPaddingArea()); + + 
printf("in_c_hp_wp_n_global_desc offset: %lu\n", + in_c_hp_wp_n_global_desc.CalculateOffset(idx1)); } -#endif -#endif } #endif }; diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index a89eb2dfb8..d26b4f1efa 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -22,17 +22,27 @@ struct PassThrough __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence{}; } - __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up) + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { return idx_up; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return idx_up_diff; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; // LowLengths: Sequence<...> @@ -55,17 +65,39 @@ struct Pad return GetLowerLengths() + LeftPads{} + RightPads{}; } - __host__ __device__ static constexpr auto CalculateLowerIndex(UpperIndex idx_up) + __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { return idx_up - LeftPads{}; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(UpperIndex idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return idx_up_diff; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) const + { + bool flag = false; + + static_for<0, nDim, 1>{}([&](auto idim) { + // only check if there is left-padding + static_if<(LeftPads::At(idim) != 0)>{}( + [&](auto) { flag = flag || idx_up[idim] < LeftPads::At(idim); }); + + // only check if there is right-padding + static_if<(RightPads::At(idim) != 0)>{}([&](auto) { + flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim); + }); + }); + + return flag; + } }; // LowLengths: Sequence<...> @@ -124,7 +156,7 @@ struct Merge .PushBack(Number<1>{}); // calculate index in each of the dimensions in the order of their dimension -#if 1 +#if 1 // would compile to same ISA? 
static_for<0, nDimLow - 1, 1>{}( lambda_CalculateLowerIndex(itmp, idx_low)); @@ -138,8 +170,10 @@ struct Merge } // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, - const LowerIndex& idx_low_old) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& idx_low_old) { LowerIndex idx_low_diff; @@ -149,6 +183,13 @@ struct Merge } __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; // UpLengths: Sequence<...> @@ -189,7 +230,10 @@ struct Unmerge return idx_low; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { return CalculateLowerIndex(idx_up_diff); } @@ -240,7 +284,10 @@ struct Embed return idx_low; } - __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff) + __host__ __device__ static constexpr auto + CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, + const UpperIndex& /* idx_up_old */, + const LowerIndex& /* idx_low_old */) { LowerIndex idx_low_diff{0}; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 7c8f3a390e..b7191f6c8b 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -3,26 +3,28 @@ #include "common_header.hpp" #include "dimension.hpp" -#include "dimension_transform.hpp" +#include "multi_index_transform.hpp" #include "tensor_descriptor.hpp" namespace ck { -template +template +struct TensorCoordinate_v2; + +template struct NativeTensorCoordinate { - using type = NativeTensorCoordinate; - using tensor_desc_type = NativeTensorDesc; - using Index = tensor_desc_type::Index; - - static constexpr index_t nDim = Index::GetSize(); + using type = NativeTensorCoordinate; + using tensor_desc_type = NativeTensorDesc; + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + using Index = MultiIndex; __host__ __device__ constexpr NativeTensorCoordinate(Index idx) - : mOffset{GetTensorDesriptor().GetOffset(idx)} + : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx)) { } - template + template __host__ __device__ constexpr NativeTensorCoordinate(Xs... 
xs) : NativeTensorCoordinate(Index{xs...}) { @@ -36,82 +38,103 @@ struct NativeTensorCoordinate __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - __host__ __device__ constexpr index_t GetOffset() const { return mOffset; } + __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; } - __host__ __device__ type operator+=(Index idx_diff) + __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; } + + __host__ __device__ constexpr type operator+=(const Index& idx_diff) { - mOffset += tensor_desc_type::GetOffsetDiff(idx_diff); + // mIndex is updated here, but some (or all) of its entries may never be used + mIndex += idx_diff; + + mOffset += tensor_desc_type::CalculateOffset(idx_diff); return *this; } - __host__ __device__ type operator-=(Index idx_diff) + __host__ __device__ constexpr type operator-=(const Index& idx_diff) { - mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(idx_diff); + // mIndex is updated here, but some (or all) of its entries may never be used + mIndex -= idx_diff; + + mOffset -= tensor_desc_type::CalculateOffset(idx_diff); return *this; } - __host__ __device__ constexpr type operator+(Index idx_diff) const + __host__ __device__ constexpr type operator+(const Index& idx_diff) const { type coord = *this; coord += idx_diff; return coord; } - __host__ __device__ constexpr type operator-(Index idx_diff) const + __host__ __device__ constexpr type operator-(const Index& idx_diff) const { type coord = *this; coord -= idx_diff; return coord; } + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool IsAnyLevelIndexInPaddingArea() { return false; } + private: + // mIndex may be saved and update, however, the value of some (or all) of its entries may + // never be used. Compiler should be able to remove these entries as well as its calculation + // as dead code. + // TODO: make sure compiler indeed remove these dead code + Index mIndex; index_t mOffset; }; -template +template struct TransformedTensorCoordinate { - using type = TransformedTensorCoordinate; using tensor_desc_type = TransformedTensorDesc; - using Index = tensor_desc_type::UpperIndex; + using LowerCoord = + typename TensorCoordinate_v2::type; + using UpperCoord = TransformedTensorCoordinate; + static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension(); + using UpperIndex = MultiIndex; - using lower_coordinate_type = - TensorCoordiante_v2::type; - - static constexpr index_t nDim = Index::GetSize(); - - __host__ __device__ constexpr TransformedTensorCoordinate(Index idx) - : mIndex{idx}, mCoordLow{GetTensorDescriptor().GetLowerIndex(idx)} + __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx) + : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)} { } - template + template __host__ __device__ constexpr TransformedTensorCoordinate(Xs... 
xs) - : TransformedTensorCoordinate(Index{xs...}) + : TransformedTensorCoordinate(UpperIndex{xs...}) { } template __host__ __device__ constexpr TransformedTensorCoordinate(Sequence) - : TransformedTensorCoordinate(Index{Xs...}) + : TransformedTensorCoordinate(UpperIndex{Xs...}) { } __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; } - __host__ __device__ constexpr index_t GetOffset() const { return mCoordLow.GetOffset(); } + __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const { return mCoordLow; } - __host__ __device__ constexpr Index GetIndex() const { return mIndex; } + __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; } - __host__ __device__ type operator+=(Index idx_up_diff) + __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); } + + __host__ __device__ constexpr const index_t& GetOffset() const + { + return GetLowerCoordinate().GetOffset(); + } + + __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff) { // For transformation of multi-index difference, not all transformation functions need to // know the old lower-index or the old upper-index. We pass both of them to the // transformation function. The transformation function itself decides to use them or not. - mCoordLow += - tensor_desc_type::GetLowerIndexDiff(idx_up_diff, mIndexUp, mCoordLow.GetIndex()); + mCoordLow += tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); // mIndexUp is updated here, but some (or all) of its entries may never be used mIndexUp += idx_up_diff; @@ -119,11 +142,35 @@ struct TransformedTensorCoordinate return *this; } - __host__ __device__ constexpr type operator+(Index idx_up_diff) const + __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff) { - type coord = *this; - coord += idx_diff; - return coord; + mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff( + idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()); + + mIndexUp -= idx_up_diff; + + return *this; + } + + __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const + { + UpperCoord coord_up = *this; + coord_up += idx_up_diff; + return coord_up; + } + + __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const + { + UpperCoord coord_up = *this; + coord_up -= idx_up_diff; + return coord_up; + } + + // TODO: should this function be here? should it be specific for padding check? + __host__ __device__ constexpr bool IsAnyLevelIndexInPaddingArea() const + { + return tensor_desc_type::IsUpperIndexInPaddingArea(GetIndex()) || + mCoordLow.IsAnyLevelIndexInPaddingArea(); } private: @@ -131,22 +178,22 @@ struct TransformedTensorCoordinate // never be used. Compiler should be able to remove these entries as well as its calculation // as dead code. 
// TODO: make sure the compiler indeed removes this dead code
-    Index mIndexUp;
-    lower_coordinate_type mCoordLow;
+    UpperIndex mIndexUp;
+    LowerCoord mCoordLow;
 };

-template 
+template 
 struct TensorCoordinate_v2
 {
     private:
-    template 
+    template 
    __host__ __device__ static constexpr auto MakeDummyTensorCoordinate(NativeTensorDescriptor)
    {
        return NativeTensorCoordinate>();
    }

-    template 
+    template 
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor)
    {
@@ -156,5 +203,12 @@
    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};
+
+template 
+__host__ __device__ constexpr auto
+make_tensor_coordinate_v2(TensorDesc, MultiIndex idx)
+{
+    return typename TensorCoordinate_v2::type(idx);
+}
}
#endif
diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
index 8ecc2cdbcf..7f65912e6d 100644
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -64,6 +64,18 @@ struct NativeTensorDescriptor
        return GetStrides(typename arithmetic_sequence_gen<0, nDim, 1>::type{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        return accumulate_on_sequence(
+            (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{});
+    }
+
+    // TODO: this cannot be constexpr because of the use of a lambda
    __host__ __device__ static constexpr index_t CalculateOffset(const Index& idx)
    {
        index_t offset = 0;
@@ -73,6 +85,12 @@
        return offset;
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
+    {
+        return CalculateOffset(idx);
+    }
+
    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        index_t offset_diff = 0;
@@ -100,6 +118,12 @@
    {
        return Tuple<>{};
    }
+
+    // TODO: should this function be here? should it be specific for padding check?
+    __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const Index& /* idx */)
+    {
+        return false;
+    }
};

// LowerTensorDescriptor
@@ -248,6 +272,17 @@ struct TransformedTensorDescriptor
        return GetLengths(Sequence{});
    }

+    __host__ __device__ static constexpr index_t GetElementSize()
+    {
+        return accumulate_on_sequence(GetLengths(), math::multiplies{}, Number<1>{});
+    }
+
+    __host__ __device__ static constexpr index_t GetElementSpace()
+    {
+        // TODO: is this the correct definition for a transformed tensor?
+        return GetLowerTensorDescriptor().GetElementSpace();
+    }
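A quick standalone check of the element-space formula introduced above, 1 + sum((length[i] - 1) * stride[i]); the transformed descriptor simply reuses the lower tensor's space because padding is a virtual view and allocates nothing. The sizes below are made up:

#include <cstdio>

// Illustrative sketch of NativeTensorDescriptor::GetElementSpace(): distance
// from the first element to one past the last addressable element.
int element_space(const int* lengths, const int* strides, int ndim)
{
    int space = 1;
    for(int i = 0; i < ndim; ++i)
        space += (lengths[i] - 1) * strides[i];
    return space;
}

int main()
{
    // e.g. a 4x3 row-major tensor: 1 + 3*3 + 2*1 = 12 elements
    const int lengths[] = {4, 3};
    const int strides[] = {3, 1};
    std::printf("%d\n", element_space(lengths, strides, 2));
}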
+
    // TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up)
    {
@@ -256,8 +291,8 @@
        static_for<0, nTransform, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

-            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));
            const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran));
+            auto idx_low_part = pick_array_element(idx_low, LowDimensionIds{}.At(itran));

            // this assumes each lower (single) index is only associated with one transformation,
            // which is required for index transformation, and has been checked during constructor
@@ -269,26 +304,29 @@
    }

    // TODO: right now the return value is not constexpr because of the use of a non-constexpr lambda
-    __host__ __device__ static constexpr LowerIndex
-    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, const LowerIndex& idx_low_old)
+    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
+        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, nTransform, 1>{}([&](auto itran) {
-            constexpr auto tran = Transforms::At(itran);
+            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
-                pick_array_element(idx_up_diff, UpDimensionIds::At(itran));
+                pick_array_element(idx_up_diff, UpDimensionIds{}.At(itran));

-            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds::At(itran));
+            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensionIds{}.At(itran));

            const auto idx_low_old_part =
-                pick_array_element(idx_low_old, LowDimensionIds::At(itran));
+                pick_array_element(idx_low_old, LowDimensionIds{}.At(itran));
+
+            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensionIds{}.At(itran));

            // this assumes each lower (single) index is associated with only one transformation,
            // which is required for index transformation, and has been checked during constructor
            // of TransformedTensorDescriptor
-            idx_low_diff_part = tran.CalculateLowerIndex(idx_up_diff_part, idx_low_old_part);
+            idx_low_diff_part = tran.CalculateLowerIndexDiff(
+                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;
@@ -299,6 +337,12 @@
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }

+    // TODO: remove this
+    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
+    {
+        return CalculateOffset(idx_up);
+    }
+
#if 0
    template 
    __host__ __device__ static constexpr bool IsLinearDimension(Number)
    {
@@ -321,6 +365,22 @@
        // not implemented
    }
#endif
+
+    // TODO: should this function be here? should it be specific for padding check?
+ __host__ __device__ static constexpr bool IsUpperIndexInPaddingArea(const UpperIndex& idx_up) + { + bool flag = false; + + static_for<0, nTransform, 1>{}([&](auto itran) { + constexpr auto tran = Transforms{}.At(itran); + + const auto idx_up_part = pick_array_element(idx_up, UpDimensionIds{}.At(itran)); + + flag = flag || tran.IsUpperIndexInPaddingArea(to_array(idx_up_part)); + }); + + return flag; + } }; template @@ -337,7 +397,7 @@ __host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths) Lengths::PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); - return make_NativeTensorDescriptor(Lengths{}, strides); + return make_native_tensor_descriptor(Lengths{}, strides); } template ; - using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3; + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v3r1; - using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3; + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v3r1; data_type mpBuffer[ThreadBufferDesc::GetElementSpace()]; @@ -667,6 +671,125 @@ struct BlockwiseGenericTensorSliceCopy_v3 ThreadwiseStore mThreadwiseStore; }; +template +struct BlockwiseGenericTensorSliceCopy_v4 +{ + static constexpr index_t nDim = SrcDesc::GetNumOfDimension(); + + using SrcCoord = typename TensorCoordinate_v2::type; + using DstCoord = typename TensorCoordinate_v2::type; + + __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin, + DstCoord dst_block_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() && + nDim == SubLengths::Size() && nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert(is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( + ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), + "wrong! 
BlockSize not consistent with ThreadClusterLengths"); + + const auto thread_cluster_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_id = + reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_id_begin = data_cluster_id * SubLengths{}; + + mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin); + mThreadwiseLoad.SetDstSliceOrigin(make_zero_array()); + + mThreadwiseStore.SetSrcSliceOrigin(make_zero_array()); + mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin); + } + + __device__ static constexpr index_t GetRegisterBufferSize() + { + return RegisterBufferDesc::GetElementSpace(); + } + + template + __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const + { + mThreadwiseLoad.Run(p_src, p_buffer); + } + + template + __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const + { + mThreadwiseStore.Run(p_buffer, p_dst); + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + TData p_buffer[GetRegisterBufferSize()]; + + mThreadwiseLoad.Run(p_src, p_buffer); + mThreadwiseStore.Run(p_buffer, p_dst); + } + + template + __device__ void + MoveSrcSlicingWindow(T step_sizes, + integral_constant positive_direction) + { + mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction); + } + + template + __device__ void + MoveDstSlicingWindow(T step_sizes, + integral_constant positive_direction) + { + mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction); + } + + private: + using RegisterBufferDesc = decltype(make_native_tensor_descriptor_packed(SubLengths{})); + + using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v4r2; + + using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2; + + ThreadwiseLoad mThreadwiseLoad; + ThreadwiseStore mThreadwiseStore; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index c0928c2bd3..3fba453a36 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -6,6 +6,8 @@ #include "ConstantMergedTensorDescriptor.hpp" #include "tensor_coordinate.hpp" #include "tensor_view.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_coordinate_v2.hpp" #ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 @@ -427,6 +429,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 Array mDstSliceOrigin; }; +// This version use TensorCoordinate // This threadwise copy allow vector access of src and dst. // It allows the dimensions of vector access to be different on src and dst. // It also allows the vector size to be different on src and dst. 
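The comment block above states the vectorized-copy contract: both sides may use vector access with different widths. The copies implement this by staging each slice through a register buffer in units of lcm(SrcDataPerAccess, DstDataPerAccess) scalars. A simplified, standalone 1-d sketch of that scheme; all names and sizes here are illustrative, and the scalar loops stand in for the reinterpret_cast vector moves in the real code:

#include <cstdio>
#include <numeric>

constexpr int SrcDataPerAccess = 2;
constexpr int DstDataPerAccess = 4;

// Copy n scalars, loading src-vector-wise and storing dst-vector-wise through
// a "long vector" register buffer of lcm(SrcDataPerAccess, DstDataPerAccess).
void copy_1d(const float* src, float* dst, int n)
{
    const int long_vector = std::lcm(SrcDataPerAccess, DstDataPerAccess); // 4
    for(int base = 0; base < n; base += long_vector)
    {
        float buffer[8]; // >= long_vector
        for(int i = 0; i < long_vector; i += SrcDataPerAccess)
            for(int j = 0; j < SrcDataPerAccess; ++j) // stands in for a vector load
                buffer[i + j] = src[base + i + j];
        for(int i = 0; i < long_vector; i += DstDataPerAccess)
            for(int j = 0; j < DstDataPerAccess; ++j) // stands in for a vector store
                dst[base + i + j] = buffer[i + j];
    }
}

int main()
{
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7}, dst[8] = {};
    copy_1d(src, dst, 8);
    std::printf("%.1f %.1f\n", dst[0], dst[7]); // 0.0 7.0
}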
@@ -774,6 +777,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 DstCoordinate mDstSliceOrigin; }; +// this version use TensorView and TensorCoordinate template -struct ThreadwiseGenericTensorSliceCopy_v3 +struct ThreadwiseGenericTensorSliceCopy_v3r1 { static constexpr index_t nDim = SrcTensor::GetNumOfDimension(); using data_type = remove_cv_t; @@ -791,10 +795,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3 using SrcCoordinate = typename SrcTensor::coordinate_type; using DstCoordinate = typename DstTensor::coordinate_type; - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3(SrcTensor src, - SrcCoordinate src_slice_origin, - DstTensor dst, - DstCoordinate dst_slice_origin) + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src, + SrcCoordinate src_slice_origin, + DstTensor dst, + DstCoordinate dst_slice_origin) : mSrc{src}, mDst{dst}, mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})}, @@ -821,8 +825,8 @@ struct ThreadwiseGenericTensorSliceCopy_v3 "wrong! vectorized access is not allowed"); } - __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3() - : ThreadwiseGenericTensorSliceCopy_v3( + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1() + : ThreadwiseGenericTensorSliceCopy_v3r1( SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{}) { } @@ -940,5 +944,154 @@ struct ThreadwiseGenericTensorSliceCopy_v3 DstSlice mDstSlice; }; +// This version use multi-index transformation +// This threadwise copy allow vector access of src and dst. +// It allows the vector size to be different on src and dst. +// The dimensions of vector access should be the same on src and dst. +// The dimension access order should be the same on src and dst. +// It is designed for cases, where one of src and dst is register, and +// the other is device memory or LDS +template +struct ThreadwiseGenericTensorSliceCopy_v4r2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = typename TensorCoordinate_v2::type; + using DstCoord = typename TensorCoordinate_v2::type; + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin, + DstCoord dst_slice_origin) + : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin) + { + static_assert(nDim == SrcDesc::GetNumOfDimension() && + nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() && + nDim == DimAccessOrder::Size(), + "wrong! # of dimensions not the same"); + + static_assert(is_valid_sequence_map{}, "wrong! map is not valid"); + + static_assert( + SliceLengths{}[VectorAccessDim] % math::lcm(SrcDataPerAccess, DstDataPerAccess) == 0, + "wrong! 
cannot evenly divide"); + + // TODO:: sanity-check if vectorized memory access is allowed on src and dst + } + + __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2() + : ThreadwiseGenericTensorSliceCopy_v4r2(make_zero_array(), + make_zero_array()) + { + } + + __device__ void SetSrcSliceOrigin(SrcCoord src_slice_origin) + { + mSrcSliceOrigin = src_slice_origin; + } + + __device__ void SetDstSliceOrigin(DstCoord dst_slice_origin) + { + mDstSliceOrigin = dst_slice_origin; + } + + template + __device__ void Run(const TData* p_src, TData* p_dst) const + { + using src_vector_t = typename vector_type::MemoryType; + using dst_vector_t = typename vector_type::MemoryType; + + constexpr auto vector_access_dim = Number{}; + + constexpr auto src_data_per_access = Number{}; + constexpr auto dst_data_per_access = Number{}; + + constexpr auto long_vector_size = Number{}; + + constexpr auto long_vector_access_lengths = SliceLengths::Modify( + vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); + + ford{}([&]( + auto long_vector_access_id) { + + // data id w.r.t slicing-window + auto long_vector_data_begin_id = long_vector_access_id; + long_vector_data_begin_id(vector_access_dim) = + long_vector_size * long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // set 0 + for(index_t i = 0; i < long_vector_size; ++i) + { + p_long_vector[i] = 0; + } + + // load data from src to the long-vector buffer + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; + + const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id); + + // check for padding + // TODO: still kind of messy + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { + const index_t src_offset = + (mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + + const index_t buffer_offset = i * src_data_per_access; + + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_offset]); + } + } + + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; + + const index_t buffer_offset = i * dst_data_per_access; + + const index_t dst_offset = + (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + + *reinterpret_cast(&p_dst[dst_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + }); + } + + template + __device__ void MoveSrcSlicingWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mSrcSliceOrigin += step_sizes; + }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); + } + + template + __device__ void MoveDstSlicingWindow(T step_sizes, integral_constant) + { + static_if{}([&](auto) { + mDstSliceOrigin += step_sizes; + }).Else([&](auto) { mDstSliceOrigin -= step_sizes; }); + } + + private: + SrcCoord mSrcSliceOrigin; + DstCoord mDstSliceOrigin; +}; + } // namespace ck #endif diff --git a/composable_kernel/include/utility/array.hpp b/composable_kernel/include/utility/array.hpp index 52e92da3f1..7b0c25d41b 100644 --- a/composable_kernel/include/utility/array.hpp +++ b/composable_kernel/include/utility/array.hpp @@ -23,20 +23,9 @@ struct Array static_assert(sizeof...(Xs) + 1 == NSize, "wrong! 
size"); } -#if 0 - template - __host__ __device__ explicit constexpr Array(const T& x) - { - static_assert(T::Size() == NSize, "wrong! size"); - - static_for<0, NSize, 1>{}([&](auto i){ - mData[i] = x.At(i); - }) - } -#endif - __host__ __device__ static constexpr index_t Size() { return NSize; } + // TODO: remove __host__ __device__ static constexpr index_t GetSize() { return Size(); } template @@ -265,8 +254,8 @@ __host__ __device__ constexpr auto extract_array(const Array& old_ return new_array; } -template // emulate constepxr lambda for array -// math +// emulate constepxr lambda for array +template struct lambda_array_math { const F& f; diff --git a/composable_kernel/include/utility/array_helper.hpp b/composable_kernel/include/utility/array_helper.hpp index a3536309fa..7608a195a3 100644 --- a/composable_kernel/include/utility/array_helper.hpp +++ b/composable_kernel/include/utility/array_helper.hpp @@ -5,8 +5,8 @@ namespace ck { -template -__host__ __device__ void print_array(const char* s, Array a) +template +__host__ __device__ void print_array(const char* s, Array a) { constexpr index_t nsize = a.GetSize(); @@ -89,5 +89,89 @@ __host__ __device__ void print_array(const char* s, Array a) }); } +template +__host__ __device__ void print_array(const char* s, Array a) +{ + constexpr index_t nsize = a.GetSize(); + + static_assert(nsize > 0 && nsize <= 10, "wrong!"); + + static_if{}([&](auto) { printf("%s size %d, {%d}\n", s, nsize, a[0]); }); + + static_if{}([&](auto) { printf("%s size %d, {%d %d}\n", s, nsize, a[0], a[1]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d}\n", s, nsize, a[0], a[1], a[2]); }); + + static_if{}( + [&](auto) { printf("%s size %d, {%d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3]); }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d}\n", s, nsize, a[0], a[1], a[2], a[3], a[4], a[5]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8]); + }); + + static_if{}([&](auto) { + printf("%s size %d, {%d %d %d %d %d %d %d %d %d %d}\n", + s, + nsize, + a[0], + a[1], + a[2], + a[3], + a[4], + a[5], + a[6], + a[7], + a[8], + a[9]); + }); +} + } // namespace ck #endif diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 0a4b43d1a6..05be2489bb 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -15,6 +15,15 @@ namespace ck { +using unsigned_t = uint32_t; +using signed_t = int; + +#if 0 // debug +using index_t = unsigned_t; +#else +using index_t = signed_t; +#endif + // For some reason, CUDA need this definition, otherwise // compiler won't generate optimal load and store instruction, and // kernel would produce wrong result, indicating the compiler fail to generate correct @@ -22,8 +31,6 @@ namespace ck { using float2_t = float2; using float4_t = float4; -using index_t = uint32_t; - template __device__ void 
fused_multiply_accumulate(T& d, const T& s0, const T& s1) { diff --git a/composable_kernel/include/utility/sequence.hpp b/composable_kernel/include/utility/sequence.hpp index 8a9fff5979..37754cca20 100644 --- a/composable_kernel/include/utility/sequence.hpp +++ b/composable_kernel/include/utility/sequence.hpp @@ -537,11 +537,9 @@ struct sequence_unique_sort }; template -struct is_valid_sequence_map +struct is_valid_sequence_map : is_same::type, + typename sequence_sort>::type> { - static constexpr bool value = - is_same::type, - typename sequence_sort>::type>{}; }; template diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp index 857924db25..c66f5f2bfc 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded.hpp @@ -115,7 +115,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn_padded(InDesc, constexpr index_t OutThreadCopyDataPerAccess_N = 4; #endif -#if 0 // debug +#if 1 // debug constexpr index_t GridSize = (N / NPerBlock) * (K / KPerBlock) * (Ho / HoPerBlock) * (Wo / WoPerBlock); #else diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index b20fc26f78..00ce2079f6 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -73,25 +73,10 @@ int main(int argc, char* argv[]) using namespace ck; #if 1 - constexpr index_t N = 10; - constexpr index_t C = 10; - constexpr index_t HI = 10; - constexpr index_t WI = 10; - constexpr index_t K = 10; - constexpr index_t Y = 1; - constexpr index_t X = 1; - - using ConvStrides = Sequence<1, 1>; - using ConvDilations = Sequence<1, 1>; - - constexpr index_t HPad = 3; - constexpr index_t WPad = 3; -#elif 1 - // 3x3, 34x34 - constexpr index_t N = 64; - constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; + constexpr index_t N = 32; + constexpr index_t C = 8; + constexpr index_t HI = 2; + constexpr index_t WI = 2; constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -99,8 +84,23 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - constexpr index_t HPad = 0; - constexpr index_t WPad = 0; + constexpr index_t HPad = 1; + constexpr index_t WPad = 1; +#elif 1 + // 3x3, 34x34 + constexpr index_t N = 64; + constexpr index_t C = 256; + constexpr index_t HI = 32; + constexpr index_t WI = 32; + constexpr index_t K = 128; + constexpr index_t Y = 3; + constexpr index_t X = 3; + + using ConvStrides = Sequence<1, 1>; + using ConvDilations = Sequence<1, 1>; + + constexpr index_t HPad = 1; + constexpr index_t WPad = 1; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42% @@ -434,7 +434,7 @@ int main(int argc, char* argv[]) if(do_verification) { -#if 1 +#if 0 if(Y == 3 && X == 3 && ConvStrides{}[0] == 1 && ConvStrides{}[1] == 1 && ConvDilations{}[0] == 1 && ConvDilations{}[1] == 1) {
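For reference, both driver configurations above keep the spatial size unchanged (3x3 filter, pad 1, stride 1, dilation 1). A quick host-side check using the standard convolution output-length formula; the helper below is illustrative and not part of the driver:

#include <cstdio>

// Output length of a 1-d convolution; the same formula applies to H and W.
int out_len(int in_len, int pad, int filter, int stride, int dilation)
{
    return (in_len + 2 * pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    std::printf("Hi=2:  Ho = %d\n", out_len(2, 1, 3, 1, 1));  // 2, first test case
    std::printf("Hi=32: Ho = %d\n", out_len(32, 1, 3, 1, 1)); // 32, the "3x3, 34x34" case
    return 0;
}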