From d4878d99f9c00746526cea4bff5be57966f92833 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Fri, 13 Sep 2019 23:30:48 -0500 Subject: [PATCH] initial padding support for nchw --- ...plicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp | 264 ++++++++++++++---- .../ConstantMatrixDescriptor.hpp | 16 +- .../ConstantTensorDescriptor.hpp | 14 +- .../multi_index_transform.hpp | 152 ++++++---- .../tensor_description/tensor_descriptor.hpp | 32 +-- .../tensor_descriptor_helper.hpp | 90 ++++++ .../threadwise_generic_tensor_slice_copy.hpp | 8 +- driver/src/driver.cpp | 8 +- 8 files changed, 429 insertions(+), 155 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index e987161395..cde9ce509a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -51,6 +51,7 @@ template struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded { +#if 1 __device__ void Run(const Float* const __restrict__ p_in_global, const Float* const __restrict__ p_wei_global, Float* const __restrict__ p_out_global) const @@ -69,20 +70,24 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded constexpr auto I1 = Number<1>{}; constexpr auto I2 = Number<2>{}; constexpr auto I3 = Number<3>{}; - constexpr auto I5 = Number<5>{}; constexpr auto True = integral_constant{}; - constexpr auto in_n_c_h_w_global_desc = InGlobalDesc{}; - constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{}; - constexpr auto out_n_k_h_w_global_desc = OutGlobalDesc{}; + constexpr auto in_n_c_hi_wi_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); + constexpr auto wei_k_c_y_x_global_desc = + make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); + constexpr auto out_n_k_ho_wo_global_desc = + make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), OutGlobalDesc::GetStrides()); - constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); - constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); + constexpr index_t N = in_n_c_hi_wi_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_hi_wi_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3); - constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); - constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); - constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); + constexpr index_t K = out_n_k_ho_wo_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3); constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); @@ -126,30 +131,35 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; // input tensor - // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] - constexpr auto in_n0_n1_n2_h_w_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Fold(I0, Number{}, Number{}) - .Extract(Sequence<0, 1, 2, 
4, 5>{}); + // global memory + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); - // batch descritpor for device memory - constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) - .Extract(Sequence<1, 2, 3>{}); + constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(Unmerge>{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); - // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy - constexpr auto in_e_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( - in_c_y_x_global_desc.Embed(in_n0_n1_n2_h_w_global_desc), - Sequence<0, 1, 2>{}, - Sequence<4>{}, - Sequence<3, 6, 7>{}, - Sequence<5>{}); + constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( + in_n0_n1_n2_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -162,8 +172,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2{}); + transform_tensor_descriptor(wei_k_c_y_x_global_desc, + make_tuple(Merge>{}, PassThrough{}), + make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( Sequence{}, Number{}); @@ -192,7 +205,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2{}); // output tensor descriptor in register, src of threadwise copy - constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_thread_desc = - out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc.ReorderGivenNew2Old( - Sequence<4, 3, 7, 0, 1, 2, 5, 6>{}); + constexpr auto out_n0_n1_n2_k0_k1_k2_ho_wo_thread_desc = + reorder_tensor_descriptor_given_upper2lower(out_k0_k1_k2_n1_n0_ho_wo_n2_thread_desc, + Sequence<4, 3, 7, 0, 1, 2, 5, 6>{}); // output memory layout descriptor in device memory, dst of threadwise copy - constexpr auto out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc = - out_n_k_h_w_global_desc.Fold(I1, Number{}, Number{}) - .Fold(I0, Number{}, Number{}); + constexpr auto 
out_n0_n1_n2_k0_k1_k2_ho_wo_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(Unmerge>{}, + Unmerge>{}, + PassThrough{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{}, Sequence<6>{}, Sequence<7>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -317,32 +340,159 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded // output merged global tensor descriptor, for calculating origin of thread tensor // in global memory - constexpr auto out_k_n1_b_n2_global_merged_desc = make_ConstantMergedTensorDescriptor( - out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc.Unfold(I3, I5), - Sequence<3>{}, - Sequence<1>{}, - Sequence<0, 4, 5>{}, - Sequence<2>{}); + constexpr auto out_n0_n1_n2_k_ho_wo_global_desc = transform_tensor_descriptor( + out_n_k_ho_wo_global_desc, + make_tuple(Unmerge>{}, + PassThrough{}, + PassThrough{}, + PassThrough{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4>{}, Sequence<5>{})); + + constexpr auto out_k_n1_b_n2_global_desc = transform_tensor_descriptor( + out_n0_n1_n2_k_ho_wo_global_desc, + make_tuple(PassThrough{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3>{}, Sequence<1>{}, Sequence<0, 4, 5>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); // origin of dst in device memory Float* p_out_thread_on_global = p_out_global + - out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( - k_thread_data_on_global, 0, b_thread_data_on_global, 0); + out_k_n1_b_n2_global_desc.CalculateOffset( + {k_thread_data_on_global, 0, b_thread_data_on_global, 0}); - ThreadwiseGenericTensorSliceCopy_v2r1< - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc), - decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()), + ThreadwiseGenericTensorSliceCopy_v4r2< + decltype(out_n0_n1_n2_k0_k1_k2_ho_wo_thread_desc), + decltype(out_n0_n1_n2_k0_k1_k2_ho_wo_global_desc), + decltype(out_n0_n1_n2_k0_k1_k2_ho_wo_thread_desc.GetLengths()), arithmetic_sequence_gen<0, 8, 1>::type, - arithmetic_sequence_gen<0, 8, 1>::type, - 7, 7, 1, 1>({0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}) .Run(p_out_thread, p_out_thread_on_global); } } +#else + __device__ void Run(const Float* const __restrict__ p_in_global, + const Float* const __restrict__ p_wei_global, + Float* const __restrict__ p_out_global) const + { + // this is a mess + // TODO: find more elegent way of specifying (or calculating) performance parameters + constexpr index_t N1 = GemmNRepeat; + constexpr index_t N2 = GemmNPerThreadSubC; + + static_assert((N1 * N2 * BPerBlock) % + (GemmNPerThreadSubC * GemmNLevel0Cluster * GemmNLevel1Cluster) == + 0, + "wrong!"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I5 = Number<5>{}; + + constexpr auto True = integral_constant{}; + + constexpr auto in_n_c_h_w_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); + constexpr auto wei_k_c_y_x_global_desc = + make_native_tensor_descriptor(WeiGlobalDesc::GetLengths(), WeiGlobalDesc::GetStrides()); + constexpr auto out_n_k_h_w_global_desc = + make_native_tensor_descriptor(OutGlobalDesc::GetLengths(), 
OutGlobalDesc::GetStrides()); + + constexpr index_t N = in_n_c_h_w_global_desc.GetLength(I0); + constexpr index_t C = in_n_c_h_w_global_desc.GetLength(I1); + constexpr index_t Hi = in_n_c_h_w_global_desc.GetLength(I2); + constexpr index_t Wi = in_n_c_h_w_global_desc.GetLength(I3); + + constexpr index_t K = out_n_k_h_w_global_desc.GetLength(I1); + constexpr index_t Ho = out_n_k_h_w_global_desc.GetLength(I2); + constexpr index_t Wo = out_n_k_h_w_global_desc.GetLength(I3); + + constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); + constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + + static_assert(N % (N1 * N2) == 0, "wrong! cannot divice N evenly among thread"); + + constexpr index_t N0 = N / (N1 * N2); + + constexpr index_t B = N0 * Ho * Wo; + + constexpr index_t E = C * Y * X; + + // sanity-check for vectorized memory load + static_assert(ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1, + "wrong! global vector load of input tensor is wrong"); + + static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), + "wrong! aligment requirement for vectorized global load of input tensor will " + "be violated"); + + constexpr auto in_n_c_hi_wi_global_desc = + make_native_tensor_descriptor(InGlobalDesc::GetLengths(), InGlobalDesc::GetStrides()); + + constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple( + PassThrough{}, PassThrough{}, Pad, LeftPads, RightPads>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{})); + + constexpr auto in_n0_n1_n2_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(Unmerge>{}, + PassThrough{}, + Embed, Sequence>{}, + Embed, Sequence>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + + constexpr auto in_e_n1_b_n2_global_desc = transform_tensor_descriptor( + in_n0_n1_n2_c_y_ho_x_wo_global_desc, + make_tuple(Merge>{}, + PassThrough{}, + Merge>{}, + PassThrough{}), + make_tuple(Sequence<3, 4, 6>{}, Sequence<1>{}, Sequence<0, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + +#if 0 + if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) + { + print_tensor_descriptor("in_n_c_hi_wi_global_desc: ", in_n_c_hi_wi_global_desc); + print_tensor_descriptor("in_n_c_hip_wip_global_desc: ", in_n_c_hip_wip_global_desc); + print_tensor_descriptor("in_n0_n1_n2_c_y_ho_x_wo_global_desc: ", + in_n0_n1_n2_c_y_ho_x_wo_global_desc); + print_tensor_descriptor("in_e_n1_b_n2_global_desc: ", in_e_n1_b_n2_global_desc); + + auto coord3 = make_tensor_coordinate_v2(in_e_n1_b_n2_global_desc, {1, 1, 1, 1}); + + auto idx3 = coord3.GetIndex(); + auto idx2 = coord3.GetLowerCoordinate().GetIndex(); + auto idx1 = coord3.GetLowerCoordinate().GetLowerCoordinate().GetIndex(); + auto idx0 = + coord3.GetLowerCoordinate().GetLowerCoordinate().GetLowerCoordinate().GetIndex(); + + print_array("idx3: ", idx3); + print_array("idx2: ", idx2); + print_array("idx1: ", idx1); + print_array("idx0: ", idx0); + } +#endif + p_out_global[0] = in_e_n1_b_n2_global_desc.CalculateOffset({0, 0, 10, 0}); + } +#endif }; } // 
namespace ck diff --git a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp index 0ac9d4083c..f2f842e113 100644 --- a/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp @@ -3,6 +3,7 @@ #include "common_header.hpp" #include "ConstantTensorDescriptor.hpp" +#include "tensor_descriptor.hpp" namespace ck { @@ -52,7 +53,7 @@ __host__ __device__ constexpr auto return ConstantMatrixDescriptor{}; } -template +template __host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorDescriptor) { using TDesc = ConstantTensorDescriptor; @@ -63,7 +64,18 @@ __host__ __device__ constexpr auto make_ConstantMatrixDescriptor(ConstantTensorD TDesc::GetStrides()[0]>{}; } -template +template +__host__ __device__ constexpr auto make_ConstantMatrixDescriptor(NativeTensorDescriptor) +{ + using TDesc = NativeTensorDescriptor; + static_assert(TDesc::GetNumOfDimension() == 2, "wrong"); + static_assert(TDesc::GetStrides()[1] == 1, "wrong"); + return ConstantMatrixDescriptor{}; +} + +template __host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s) { printf( diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index b0f6d32ec8..65a7b219f6 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -6,7 +6,7 @@ namespace ck { template -__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) +__host__ __device__ constexpr auto calculate_tensor_strides_packed_old(Lengths) { return reverse_inclusive_scan_sequence( Lengths{}.PopFront(), math::multiplies{}, Number<1>{}) @@ -14,12 +14,12 @@ __host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) } template -__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number) +__host__ __device__ constexpr auto calculate_tensor_strides_aligned_old(Lengths, Number) { constexpr index_t L_back_align = Align * math::integer_divide_ceiler{}(Lengths{}.Back(), Align); - return calculate_tensor_strides_packed( + return calculate_tensor_strides_packed_old( Lengths{}.Modify(Number{}, Number{})); } @@ -187,7 +187,7 @@ struct ConstantTensorDescriptor { Array multi_id; - using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths())); + using PackedStrides = decltype(calculate_tensor_strides_packed_old(GetLengths())); // calculate index in each of the dimensions in the order of their dimension static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex(id, multi_id)); @@ -468,7 +468,7 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto Pack() { - using packed_strides = decltype(calculate_tensor_strides_packed(Lengths{})); + using packed_strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); return ConstantTensorDescriptor{}; } @@ -490,7 +490,7 @@ struct ConstantTensorDescriptor template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { - using Strides = decltype(calculate_tensor_strides_packed(Lengths{})); + using Strides = decltype(calculate_tensor_strides_packed_old(Lengths{})); return ConstantTensorDescriptor{}; } @@ -503,7 +503,7 @@ __host__ __device__ constexpr auto 
make_ConstantTensorDescriptor(Lengths, Stride template __host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number) { - using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number{})); + using Strides = decltype(calculate_tensor_strides_aligned_old(Lengths{}, Number{})); return ConstantTensorDescriptor{}; } diff --git a/composable_kernel/include/tensor_description/multi_index_transform.hpp b/composable_kernel/include/tensor_description/multi_index_transform.hpp index 7becf3093e..d888f87d81 100644 --- a/composable_kernel/include/tensor_description/multi_index_transform.hpp +++ b/composable_kernel/include/tensor_description/multi_index_transform.hpp @@ -24,8 +24,6 @@ struct PassThrough __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; } - __host__ __device__ static constexpr auto GetLowerLengths() { return Sequence{}; } - __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence{}; } __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) @@ -51,11 +49,11 @@ struct PassThrough } }; -// LowLengths: Sequence<...> -template +// LowerLengths: Sequence<...> +template struct Pad { - static constexpr index_t nDim = LowLengths::Size(); + static constexpr index_t nDim = LowerLengths::Size(); using LowerIndex = MultiIndex; using UpperIndex = MultiIndex; @@ -64,11 +62,9 @@ struct Pad __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number{}; } - __host__ __device__ static constexpr auto GetLowerLengths() { return LowLengths{}; } - __host__ __device__ static constexpr auto GetUpperLengths() { - return GetLowerLengths() + LeftPads{} + RightPads{}; + return LowerLengths{} + LeftPads{} + RightPads{}; } __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) @@ -98,7 +94,7 @@ struct Pad // only check if there is right-padding static_if<(RightPads::At(idim) != 0)>{}([&](auto) { - flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowLengths::At(idim); + flag = flag || idx_up[idim] >= LeftPads::At(idim) + LowerLengths::At(idim); }); }); @@ -106,11 +102,11 @@ struct Pad } }; -// LowLengths: Sequence<...> -template +// LowerLengths: Sequence<...> +template struct Merge { - static constexpr index_t nDimLow = LowLengths::Size(); + static constexpr index_t nDimLow = LowerLengths::Size(); static constexpr index_t nDimUp = 1; using LowerIndex = MultiIndex; @@ -120,12 +116,10 @@ struct Merge __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number{}; } - __host__ __device__ static constexpr auto GetLowerLengths() { return LowLengths{}; } - __host__ __device__ static constexpr auto GetUpperLengths() { return Sequence{}, Number<1>{})>{}; + LowerLengths{}, math::multiplies{}, Number<1>{})>{}; } // emulate constexpr lambda @@ -158,11 +152,11 @@ struct Merge constexpr auto pseudo_low_strides = reverse_inclusive_scan_sequence( - GetLowerLengths().PopFront(), math::multiplies{}, Number<1>{}) + LowerLengths::PopFront(), math::multiplies{}, Number<1>{}) .PushBack(Number<1>{}); -// calculate index in each of the dimensions in the order of their dimension -#if 1 // would compile to same ISA? +#if 1 // would these 2 versions be compiled to same ISA? 
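        // Worked example of the decomposition below (editor illustration; the lengths are
        // assumed, not taken from this patch): with LowerLengths = Sequence<4, 3, 5>,
        // pseudo_low_strides is (15, 5, 1). A merged upper index of 37 is then split
        // dimension by dimension as 37 / 15 = 2 (rem 7), 7 / 5 = 1 (rem 2), which yields
        // the lower index (2, 1, 2); check: 2 * 15 + 1 * 5 + 2 = 37.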
+ // calculate index in each of the dimensions in the order of their dimension static_for<0, nDimLow - 1, 1>{}( lambda_CalculateLowerIndex(itmp, idx_low)); @@ -176,16 +170,75 @@ struct Merge } // idx_low_diff depends on idx_low_old, so idx_low need to be up-to-date + // If idx_up_diff is known at compile-time, many calculations can be optimized + // away by compiler + // This function assume idx_low_old is not out-of-bound __host__ __device__ static constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff, const UpperIndex& /* idx_up_old */, const LowerIndex& idx_low_old) { - LowerIndex idx_low_diff; + // do nothing if idx_up_diff == 0 + if(idx_up_diff[0] == 0) + { + return make_zero_array(); + } - // not implemeneted + // CalculateLowerIndex(idx_up_diff) has multiple integer divisions. + // If idx_up_diff is known at compile-time, the calculation can + // be done at compile-time. However, if idx_up_diff is only known + // at run-time, then the calculation will also be computed at + // run-time, and can be very expensive. + LowerIndex idx_low_new = idx_low_old + CalculateLowerIndex(idx_up_diff); - return idx_low_diff; + if(idx_up_diff[0] > 0) + { + bool carry = false; + + // do carry check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, nDimLow, 1>{}([&](auto ireverse) { + constexpr index_t i = nDimLow - 1 - ireverse; + + if(carry) + { + ++idx_low_new(i); + } + + carry = false; + + if(idx_low_new[i] >= LowerLengths::At(i)) + { + idx_low_new(i) -= LowerLengths::At(i); + carry = true; + } + }); + } + else if(idx_up_diff[0] < 0) + { + bool borrow = false; + + // do borrow check in reversed order, starting from lowest dimension + // don't check the highest dimension + static_for<0, nDimLow, 1>{}([&](auto ireverse) { + constexpr index_t i = nDimLow - 1 - ireverse; + + if(borrow) + { + --idx_low_new(i); + } + + borrow = false; + + if(idx_low_new[i] < 0) + { + idx_low_new(i) += LowerLengths::At(i); + borrow = true; + } + }); + } + + return idx_low_new - idx_low_old; } __host__ __device__ static constexpr bool IsLinearTransform() { return false; } @@ -198,12 +251,12 @@ struct Merge } }; -// UpLengths: Sequence<...> -template +// UpperLengths: Sequence<...> +template struct Unmerge { static constexpr index_t nDimLow = 1; - static constexpr index_t nDimUp = UpLengths::Size(); + static constexpr index_t nDimUp = UpperLengths::Size(); using LowerIndex = MultiIndex; using UpperIndex = MultiIndex; @@ -212,23 +265,16 @@ struct Unmerge __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number{}; } - __host__ __device__ static constexpr auto GetLowerLengths() - { - constexpr index_t low_length = - accumulate_on_sequence(UpLengths{}, math::multiplies{}, Number<1>{}); - - return Sequence{}; - } - - __host__ __device__ static constexpr auto GetUpperLengths() { return UpLengths{}; } + __host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; } __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { LowerIndex idx_low{0}; constexpr auto pseudo_up_strides = - typename sequence_reverse_inclusive_scan, 1>:: - type{}; + reverse_inclusive_scan_sequence( + UpperLengths::PopFront(), math::multiplies{}, Number<1>{}) + .PushBack(Number<1>{}); static_for<0, nDimUp, 1>{}( [&](auto idim) { idx_low(0) += idx_up[idim] * pseudo_up_strides[idim]; }); @@ -245,47 +291,45 @@ struct Unmerge } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + // 
TODO: should this function be here? should it be specific for padding check? + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; -// UpLengths: Sequence<...> +// UpperLengths: Sequence<...> // Coefficients: Sequence<...> // idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] + coefficients[nDimUp] -template +template struct Embed { static constexpr index_t nDimLow = 1; - static constexpr index_t nDimUp = UpLengths::Size(); + static constexpr index_t nDimUp = UpperLengths::Size(); using LowerIndex = MultiIndex; using UpperIndex = MultiIndex; __host__ __device__ explicit constexpr Embed() { - static_assert(UpLengths::GetSize() == nDimUp && Coefficients::GetSize() == nDimUp + 1, + static_assert(UpperLengths::GetSize() == nDimUp && Coefficients::GetSize() == nDimUp + 1, "wrong! # of dimensions not consistent"); - - constexpr index_t low_id_max = - Coefficients::Back() + accumulate_on_sequence(UpLengths{} * Coefficients::PopBack(), - math::plus{}, - Number<0>{}); - - static_assert(low_id_max < LowLength, "wrong! lower-id will go out of range"); } __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number{}; } __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number{}; } - __host__ __device__ static constexpr auto GetLowerLengths() { return Sequence{}; } - - __host__ __device__ static constexpr auto GetUpperLengths() { return UpLengths{}; } + __host__ __device__ static constexpr auto GetUpperLengths() { return UpperLengths{}; } __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) { LowerIndex idx_low(Coefficients{}[nDimUp]); static_for<0, nDimUp, 1>{}( - [&](auto idim) { idx_low[0] += idx_up[idim] * Coefficients{}[idim]; }); + [&](auto idim) { idx_low(0) += idx_up[idim] * Coefficients{}[idim]; }); return idx_low; } @@ -298,12 +342,18 @@ struct Embed LowerIndex idx_low_diff{0}; static_for<0, nDimUp, 1>{}( - [&](auto idim) { idx_low_diff[0] += idx_up_diff[idim] * Coefficients{}[idim]; }); + [&](auto idim) { idx_low_diff(0) += idx_up_diff[idim] * Coefficients{}[idim]; }); return idx_low_diff; } __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool + IsUpperIndexInPaddingArea(const UpperIndex& /* idx_up */) + { + return false; + } }; } // namespace ck diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index 7f65912e6d..0113997a80 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -207,10 +207,12 @@ struct TransformedTensorDescriptor return LowTensorDescriptor{}; } +#if 0 __host__ __device__ static constexpr auto GetLowerLengths() { return GetLowerTensorDescriptor().GetLengths(); } +#endif struct lambda_GetUpperLengths { @@ -383,35 +385,5 @@ struct TransformedTensorDescriptor } }; -template -__host__ __device__ constexpr auto make_native_tensor_descriptor(Sequence, - Sequence) -{ - return NativeTensorDescriptor...>{}; -} - -template -__host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths) -{ - constexpr auto strides = reverse_inclusive_scan_sequence( - Lengths::PopFront(), math::multiplies{}, Number<1>{}) - .PushBack(Number<1>{}); - - return make_native_tensor_descriptor(Lengths{}, strides); -} - -template -__host__ __device__ 
constexpr auto - transform_tensor_descriptor(LowTensorDescriptor, Transforms, LowDimensionIds, UpDimensionIds) -{ - return TransformedTensorDescriptor{}; -} - } // namespace ck #endif diff --git a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp index 603cd1e3d6..52dc1642e6 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp @@ -6,6 +6,96 @@ namespace ck { +template +__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) +{ + return reverse_inclusive_scan_sequence( + Lengths{}.PopFront(), math::multiplies{}, Number<1>{}) + .PushBack(Number<1>{}); +} + +template +__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number) +{ + constexpr index_t L_back_align = + Align * math::integer_divide_ceiler{}(Lengths{}.Back(), Align); + + return calculate_tensor_strides_packed( + Lengths{}.Modify(Number{}, Number{})); +} + +template +__host__ __device__ constexpr auto make_native_tensor_descriptor(Sequence, + Sequence) +{ + return NativeTensorDescriptor...>{}; +} + +template +__host__ __device__ constexpr auto make_native_tensor_descriptor_packed(Lengths) +{ + constexpr auto strides = calculate_tensor_strides_packed(Lengths{}); + + return make_native_tensor_descriptor(Lengths{}, strides); +} + +template +__host__ __device__ constexpr auto make_native_tensor_descriptor_aligned(Lengths, Number) +{ + constexpr auto strides = calculate_tensor_strides_aligned(Lengths{}, Number{}); + return make_native_tensor_descriptor(Lengths{}, strides); +} + +template +__host__ __device__ constexpr auto + transform_tensor_descriptor(LowTensorDescriptor, Transforms, LowDimensionIds, UpDimensionIds) +{ + return TransformedTensorDescriptor{}; +} + +template +__host__ __device__ constexpr auto reorder_tensor_descriptor_impl(LowerTensorDescriptor, + Sequence, + Sequence, + Sequence) +{ + return TransformedTensorDescriptor...>, + Tuple...>, + Tuple...>>{}; +} + +template +__host__ __device__ constexpr auto + reorder_tensor_descriptor_given_lower2upper(LowerTensorDescriptor, MapLower2Upper) +{ + static_assert(is_valid_sequence_map{}, + "wrong! MapLower2Upper is not a valid map"); + + return reorder_tensor_descriptor_impl( + LowerTensorDescriptor{}, + LowerTensorDescriptor::GetLengths(), + typename arithmetic_sequence_gen<0, LowerTensorDescriptor::GetNumOfDimension(), 1>::type{}, + MapLower2Upper{}); +} + +template +__host__ __device__ constexpr auto + reorder_tensor_descriptor_given_upper2lower(LowerTensorDescriptor, MapUpper2Lower) +{ + return reorder_tensor_descriptor_given_lower2upper( + LowerTensorDescriptor{}, typename sequence_map_inverse::type{}); +} + template __host__ __device__ void print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 723f6766da..56de4c6f66 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -951,10 +951,10 @@ struct ThreadwiseGenericTensorSliceCopy_v3r1 // The dimension access order should be the same on src and dst. 
// It is designed for cases, where one of src and dst is register, and // the other is device memory or LDS -template diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 558abb207e..55dacfa289 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -91,8 +91,8 @@ int main(int argc, char* argv[]) // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; - constexpr index_t HI = 34; - constexpr index_t WI = 34; + constexpr index_t HI = 32; + constexpr index_t WI = 32; constexpr index_t K = 128; constexpr index_t Y = 3; constexpr index_t X = 3; @@ -100,8 +100,8 @@ int main(int argc, char* argv[]) using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 0>; - using RightPads = Sequence<0, 0>; + using LeftPads = Sequence<1, 1>; + using RightPads = Sequence<1, 1>; #elif 0 // 1x1 filter, 8x8 image // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
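    // Illustration of the driver change above (editor sketch; it uses the standard
    // convolution output-size relation, which is not spelled out in this patch):
    //   Hip = HI + LeftPad + RightPad = 32 + 1 + 1 = 34          (likewise Wip = 34)
    //   Ho  = (Hip - ConvDilationH * (Y - 1) - 1) / ConvStrideH + 1
    //       = (34 - 1 * 2 - 1) / 1 + 1 = 32                      (likewise Wo = 32)
    // i.e. the new padded 32x32 configuration covers the same 34x34 input footprint
    // that the previous zero-pad 34x34 setting did.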