From 37b82b7e5484e510c65c01efb9a5421498e3db96 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 19 Jun 2019 22:26:45 -0500 Subject: [PATCH] refactor --- ...mm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp | 22 +++++-- .../ConstantMergedTensorDescriptor.hpp | 8 +-- .../ConstantTensorDescriptor.hpp | 28 ++++---- .../blockwise_generic_tensor_slice_copy.hpp | 52 ++++++++------- .../include/utility/Sequence.hpp | 35 +++++----- .../include/utility/integral_constant.hpp | 64 ++++++++++++++----- composable_kernel/include/utility/math.hpp | 16 ++--- 7 files changed, 139 insertions(+), 86 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp index d5ea777824..65c397564f 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -84,6 +84,12 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t Y = wei_k_c_y_x_global_desc.GetLength(I2); constexpr index_t X = wei_k_c_y_x_global_desc.GetLength(I3); + constexpr index_t ConvStrideH = ConvStrides{}[0]; + constexpr index_t ConvStrideW = ConvStrides{}[1]; + + constexpr index_t ConvDilationH = ConvDilations{}[0]; + constexpr index_t ConvDilationW = ConvDilations{}[1]; + static_assert(N % (N1 * N2) == 0, "wrong! cannot divice N evenly among thread"); constexpr index_t N0 = N / (N1 * N2); @@ -92,6 +98,14 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer constexpr index_t E = C * Y * X; + // sanity-check for vectorized memory load + static_assert(ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1, + "wrong! global vector load of input tensor is wrong"); + + static_assert((X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0), + "wrong! aligment requirement for vectorized global load of input tensor will " + "be violated"); + // divide block work by [K, B] static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0, "wrong! 
cannot divide work evenly among block"); @@ -111,15 +125,15 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw_lds_double_buffer // input tensor // tensor descriptor in device memory [N0, N1, N2, Ho, Wo] constexpr auto in_n0_n1_n2_h_w_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) .Fold(I0, Number{}, Number{}) .Extract(Sequence<0, 1, 2, 4, 5>{}); // batch descritpor for device memory constexpr auto in_c_y_x_global_desc = - in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) - .StridedSlice(I3, Number{}, Number{}) + in_n_c_h_w_global_desc.StridedSlice(I2, Number{}, Number{}) + .StridedSlice(I3, Number{}, Number{}) .Extract(Sequence<1, 2, 3>{}); // merged tensor descriptor in device memory [E, N1, B, N2], src of blockwise copy diff --git a/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp index 757f0ad691..700f80845e 100644 --- a/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantMergedTensorDescriptor.hpp @@ -37,7 +37,7 @@ struct ConstantMergedTensorDescriptor return OriginalTensorDesc{}; } - __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } + __host__ __device__ static constexpr auto GetNumOfDimension() { return Number{}; } template __host__ __device__ static constexpr auto GetContainedOriginalDimensions(Number) @@ -52,7 +52,7 @@ struct ConstantMergedTensorDescriptor } template - __host__ __device__ static constexpr index_t GetLength(Number) + __host__ __device__ static constexpr auto GetLength(Number) { constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); @@ -60,7 +60,7 @@ struct ConstantMergedTensorDescriptor } template - __host__ __device__ static constexpr index_t GetStride(Number) + __host__ __device__ static constexpr auto GetStride(Number) { static_assert(!ContainMultipleOriginalDimensions(Number{}), "wrong! 
stride of a merged dimension is undefined"); @@ -75,7 +75,7 @@ struct ConstantMergedTensorDescriptor return Sequence{}; } - __host__ __device__ static constexpr index_t GetElementSize() + __host__ __device__ static constexpr auto GetElementSize() { return OriginalTensorDesc::GetElementSize(); } diff --git a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp index 3ed9f3a2b8..5fad7a46a1 100644 --- a/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp +++ b/composable_kernel/include/tensor_description/ConstantTensorDescriptor.hpp @@ -43,22 +43,22 @@ struct ConstantTensorDescriptor return Sequence{}; } - __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } + __host__ __device__ static constexpr auto GetNumOfDimension() { return Number{}; } __host__ __device__ static constexpr auto GetLengths() { return Lengths{}; } __host__ __device__ static constexpr auto GetStrides() { return Strides{}; } - template - __host__ __device__ static constexpr index_t GetLength(Number) + template + __host__ __device__ static constexpr auto GetLength(IDim) { - return Lengths::Get(Number{}); + return Lengths::Get(IDim{}); } - template - __host__ __device__ static constexpr index_t GetStride(Number) + template + __host__ __device__ static constexpr auto GetStride(IDim) { - return Strides::Get(Number{}); + return Strides::Get(IDim{}); } struct lambda_AreDimensionsContinuous @@ -102,17 +102,18 @@ struct ConstantTensorDescriptor return false; } - __host__ __device__ static constexpr index_t GetElementSize() + __host__ __device__ static constexpr auto GetElementSize() { - return accumulate_on_sequence(Lengths{}, math::multiplies{}, Number<1>{}); + return Number{}, Number<1>{})>{}; } - __host__ __device__ static constexpr index_t GetElementSpace() + __host__ __device__ static constexpr auto GetElementSpace() { constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), math::plus{}, Number<1>{}); - return element_space_unaligned; + return Number{}; } // emulate constexpr lambda @@ -156,13 +157,14 @@ struct ConstantTensorDescriptor } template - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence) + __host__ __device__ static constexpr auto GetOffsetFromMultiIndex(Sequence) { static_assert(sizeof...(Is) == nDim, "wrong! Dimension not consistent"); constexpr auto multi_id = Sequence{}; - return accumulate_on_sequence(multi_id * GetStrides(), math::plus{}, Number<0>{}); + return Number{}, Number<0>{})>{}; } // emulate constexpr lambda diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index ed633158e8..e7fb7e2a71 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -83,9 +83,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 // divide work constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{}; - static_for<0, nDim, 1>{}([&](auto IDim_) { - constexpr auto IDim = decltype(IDim_){}; - + static_for<0, nDim, 1>{}([&](auto IDim) { static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0, "wrong! 
cannot evenly divide sliced tensor into sub-tensor"); @@ -95,9 +93,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 // for now, only support SubLengths == 1 on a merged dimension that constains // multiple original dimensions - static_for<0, nDim, 1>{}([&](auto IDim_) { - constexpr auto IDim = decltype(IDim_){}; - + static_for<0, nDim, 1>{}([&](auto IDim) { static_assert(SubLengths::Get(IDim) == 1 || (!SrcDesc::ContainMultipleOriginalDimensions(IDim) && !DstDesc::ContainMultipleOriginalDimensions(IDim)), @@ -121,8 +117,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 dst_block_data_multi_id_begin + thread_data_multi_id_begin); // partial offset on each dimension - static_for<0, nDim, 1>{}([&](auto IDim_) { - constexpr auto IDim = decltype(IDim_){}; + static_for<0, nDim, 1>{}([&](auto IDim) { constexpr index_t idim = IDim; constexpr auto src_partial_original_dims = @@ -135,8 +130,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); }); - static_for<0, nDim, 1>{}([&](auto IDim_) { - constexpr auto IDim = decltype(IDim_){}; + static_for<0, nDim, 1>{}([&](auto IDim) { constexpr index_t idim = IDim; constexpr auto dst_partial_original_dims = @@ -208,6 +202,13 @@ struct BlockwiseGenericTensorSliceCopy_v1 thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin); #endif + // By position the origin of the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // User need to guarantee this is true. + // By setting SubLengths = 1 at the merged dimension, this is always true; + // If in the future, you want to enable SubLengths > 1 at the merged dimension, + // special care in implementation is needed threadwise_generic_tensor_slice_copy_v1(SrcDesc{}, p_src + src_offset + mThreadSrcOffset, make_zero_array(), @@ -259,6 +260,13 @@ struct BlockwiseGenericTensorSliceCopy_v1 const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id_begin); #endif + // By position the origin of the per-thread window at the point, where multi-index + // of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy + // is assuming each thread is copy a noraml (not merged) tensor. + // User need to guarantee this is true. 
+ // By setting SubLengths = 1 at the merged dimension, this is always true; + // If in the future, you want to enable SubLengths > 1 at the merged dimension, + // special care in implementation is needed threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc, p_clipboard + clipboard_offset, make_zero_array(), @@ -292,8 +300,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 __device__ void MoveSlicingWindowOnSourceTensor( Number, Number, integral_constant direction) { - constexpr auto IDim = Number{}; - constexpr index_t idim = IDim; + constexpr auto IDim = Number{}; static_if{}([&](auto) { // logic for a merged dimension, also works for non-merged dimension, but its logic may @@ -316,22 +323,21 @@ struct BlockwiseGenericTensorSliceCopy_v1 old_src_partial_original_multi_id, StepSize, direction); // update "mThreadSrcOriginalMultiId" - static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I_) { - constexpr auto I = decltype(I_){}; - constexpr index_t idim_original = src_partial_original_dims.Get(I); + static_for<0, decltype(src_partial_original_dims)::GetSize(), 1>{}([&](auto I) { + constexpr auto IDimOriginal = src_partial_original_dims[I]; - mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I]; + mThreadSrcOriginalMultiId(IDimOriginal) = new_src_partial_original_multi_id[I]; }); // calculate new partial offset on this merged dimension - const index_t old_src_partial_offset = mThreadSrcPartialOffsets[idim]; + const index_t old_src_partial_offset = mThreadSrcPartialOffsets[IDim]; const index_t new_src_partial_offset = src_partial_original_desc.GetOffsetFromMultiIndex( new_src_partial_original_multi_id); // update "mThreadSrcPartialOffsets" - mThreadSrcPartialOffsets(idim) = new_src_partial_offset; + mThreadSrcPartialOffsets(IDim) = new_src_partial_offset; // update "mThreadSrcOffset", do "+" before "-" to avoid underflow mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; @@ -346,20 +352,20 @@ struct BlockwiseGenericTensorSliceCopy_v1 // of the boundary of the tensor being sliced. Otherwise, there might be hazard like // unsigned integer underflow. 
That is NO runtime sanity check to prevent the hazard - constexpr index_t idim_original = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); + constexpr auto IDimOriginal = SrcDesc::GetContainedOriginalDimensions(IDim).Front(); static_if{}([&](auto fwd) { mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); - mThreadSrcOriginalMultiId(idim_original) += StepSize; + mThreadSrcOriginalMultiId(IDimOriginal) += StepSize; - mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); + mThreadSrcPartialOffsets(IDim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); }).Else([&](auto fwd) { mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - mThreadSrcOriginalMultiId(idim_original) -= StepSize; + mThreadSrcOriginalMultiId(IDimOriginal) -= StepSize; - mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); + mThreadSrcPartialOffsets(IDim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); }); }); } diff --git a/composable_kernel/include/utility/Sequence.hpp b/composable_kernel/include/utility/Sequence.hpp index a579bef8c5..ec56211479 100644 --- a/composable_kernel/include/utility/Sequence.hpp +++ b/composable_kernel/include/utility/Sequence.hpp @@ -16,31 +16,32 @@ struct Sequence static constexpr index_t mSize = sizeof...(Is); - __host__ __device__ static constexpr index_t GetSize() { return mSize; } + __host__ __device__ static constexpr auto GetSize() { return Number{}; } - template - __host__ __device__ static constexpr index_t Get(Number) + __host__ __device__ static constexpr index_t GetImpl(index_t I) { - static_assert(I < mSize, "wrong! I too large"); - // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 const index_t mData[mSize + 1] = {Is..., 0}; return mData[I]; } template - __host__ __device__ constexpr auto operator[](Number) const + __host__ __device__ static constexpr auto Get(Number) { - return Number{})>{}; + static_assert(I < mSize, "wrong! 
I too large"); + + return Number{})>{}; } - // make sure I is constepxr - __host__ __device__ constexpr index_t operator[](index_t I) const + template + __host__ __device__ constexpr auto operator[](Number) const { - const index_t mData[mSize + 1] = {Is..., 0}; - return mData[I]; + return Get(Number{}); } + // make sure I is constepxr if you want a constexpr return type + __host__ __device__ constexpr index_t operator[](index_t I) const { return GetImpl(I); } + template __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { @@ -54,16 +55,16 @@ struct Sequence __host__ __device__ static constexpr auto Reverse(); - __host__ __device__ static constexpr index_t Front() + __host__ __device__ static constexpr auto Front() { - const index_t mData[mSize + 1] = {Is..., 0}; - return mData[0]; + static_assert(mSize > 0, "wrong!"); + return Get(Number<0>{}); } - __host__ __device__ static constexpr index_t Back() + __host__ __device__ static constexpr auto Back() { - const index_t mData[mSize + 1] = {Is..., 0}; - return mData[mSize - 1]; + static_assert(mSize > 0, "wrong!"); + return Get(Number{}); } __host__ __device__ static constexpr auto PopFront(); diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp index e4c213d7af..cd2819398f 100644 --- a/composable_kernel/include/utility/integral_constant.hpp +++ b/composable_kernel/include/utility/integral_constant.hpp @@ -13,21 +13,6 @@ struct integral_constant __host__ __device__ constexpr value_type operator()() const noexcept { return value; } }; -template -__host__ __device__ constexpr auto operator+(integral_constant, integral_constant) -{ - return integral_constant{}; -} - -template -__host__ __device__ constexpr auto operator*(integral_constant, integral_constant) -{ - return integral_constant{}; -} - -template -using Number = integral_constant; - template struct is_same : public integral_constant { @@ -38,5 +23,54 @@ struct is_same : public integral_constant { }; +template +using Number = integral_constant; + +template +__host__ __device__ constexpr auto operator+(Number, Number) +{ + return Number{}; +} + +template +__host__ __device__ constexpr auto operator-(Number, Number) +{ + static_assert(Y <= X, "wrong!"); + return Number{}; +} + +template +__host__ __device__ constexpr auto operator*(Number, Number) +{ + return Number{}; +} + +template +__host__ __device__ constexpr auto operator/(Number, Number) +{ + static_assert(Y > 0, "wrong!"); + return Number{}; +} + +template +__host__ __device__ constexpr auto operator%(Number, Number) +{ + static_assert(Y > 0, "wrong!"); + return Number{}; +} + +#if 0 +static constexpr Number<0> 0_c; +static constexpr Number<1> 1_c; +static constexpr Number<2> 2_c; +static constexpr Number<3> 3_c; +static constexpr Number<4> 4_c; +static constexpr Number<5> 5_c; +static constexpr Number<6> 6_c; +static constexpr Number<7> 7_c; +static constexpr Number<8> 8_c; +static constexpr Number<9> 9_c; +#endif + } // namespace ck #endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp index 92a802a1c9..197759ad25 100644 --- a/composable_kernel/include/utility/math.hpp +++ b/composable_kernel/include/utility/math.hpp @@ -42,20 +42,16 @@ struct integer_divide_ceiler } }; -template -__host__ __device__ constexpr T integer_divide_ceil(T a, T b) +template +__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) { - static_assert(is_same{} || is_same{}, "wrong type"); 
-
-    return (a + b - 1) / b;
+    return (x + y - 1) / y;
 }
 
-template <class T>
-__host__ __device__ constexpr T integer_least_multiple(T a, T b)
+template <class X, class Y>
+__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
 {
-    static_assert(is_same{} || is_same{}, "wrong type");
-
-    return b * integer_divide_ceil(a, b);
+    return y * integer_divide_ceil(x, y);
 }
 
 template
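
The changes above share one theme: getters on the tensor descriptors and on Sequence now return Number<...> (an integral_constant) instead of a plain constexpr index_t, and integral_constant.hpp gains arithmetic operators on Number<> so those results compose without dropping back to runtime values. The sketch below is illustrative only, not part of the patch: ToyDesc and the trimmed-down integral_constant are stand-ins for the real headers, and it assumes a plain C++14 host compiler (the __host__ __device__ qualifiers are omitted).

#include <cstdint>

using index_t = std::int32_t;

// trimmed-down stand-in for ck::integral_constant
template <class T, T V>
struct integral_constant
{
    static constexpr T value = V;
    using value_type         = T;
    constexpr value_type operator()() const noexcept { return value; }
};

template <index_t N>
using Number = integral_constant<index_t, N>;

// compile-time arithmetic in the spirit of the operators added in
// integral_constant.hpp (the patch also adds -, / and % guarded by static_asserts)
template <index_t X, index_t Y>
constexpr auto operator+(Number<X>, Number<Y>)
{
    return Number<X + Y>{};
}

template <index_t X, index_t Y>
constexpr auto operator*(Number<X>, Number<Y>)
{
    return Number<X * Y>{};
}

// a getter in the post-refactor style of ConstantTensorDescriptor::GetLength:
// the length comes back as a Number<>, so it can flow straight into further
// compile-time arithmetic or into another template argument
template <index_t L0, index_t L1>
struct ToyDesc
{
    template <index_t IDim>
    static constexpr auto GetLength(Number<IDim>)
    {
        static_assert(IDim < 2, "wrong! dimension out of range");
        return Number<((IDim == 0) ? L0 : L1)>{};
    }
};

int main()
{
    // the product of two lengths is again a Number<>, provable at compile time
    constexpr auto size =
        ToyDesc<4, 8>::GetLength(Number<0>{}) * ToyDesc<4, 8>::GetLength(Number<1>{});
    static_assert(size.value == 32, "element size stays a compile-time constant");
    return 0;
}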