From 7a89684f92cc39afbb13ad970c1a3282e60b9180 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 6 Jun 2019 16:50:35 -0500 Subject: [PATCH] refactor --- driver/driver.hip.cpp | 4 +- src/include/Array.hip.hpp | 80 +++---- .../ConstantMergedTensorDescriptor.hip.hpp | 76 +++---- src/include/ConstantTensorDescriptor.hip.hpp | 197 +++++++----------- src/include/Sequence.hip.hpp | 61 ++---- .../blockwise_generic_tensor_slice_op.hip.hpp | 16 +- src/include/common.hip.hpp | 2 +- src/include/functional.hip.hpp | 38 +--- src/include/functional2.hip.hpp | 77 +++---- ..._implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp | 6 +- ..._implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp | 4 +- ..._implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp | 2 +- ..._implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp | 4 +- ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp | 4 +- ...3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp | 4 +- ...3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp | 4 +- ..._implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp | 4 +- ..._implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp | 4 +- ...on_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp | 2 +- ...2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp | 2 +- ...3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp | 76 +++---- ...on_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp | 34 ++- ...4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp | 41 +--- ...on_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp | 70 ++++--- src/include/integral_constant.hip.hpp | 2 +- ...threadwise_generic_tensor_slice_op.hip.hpp | 2 +- 26 files changed, 299 insertions(+), 517 deletions(-) diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 40cd4fdd3f..db13ffbba8 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -646,9 +646,9 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn -#elif 0 - device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw #elif 1 + device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw +#elif 0 device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 4e1162a8a3..33cc547a97 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -18,9 +18,21 @@ struct Array __host__ __device__ constexpr index_t GetSize() const { return NSize; } + template + __host__ __device__ constexpr TData operator[](Number) const + { + return mData[I]; + } + __host__ __device__ constexpr TData operator[](index_t i) const { return mData[i]; } - __host__ __device__ TData& operator[](index_t i) { return mData[i]; } + template + __host__ __device__ TData& operator()(Number) + { + return mData[I]; + } + + __host__ __device__ TData& operator()(index_t i) { return mData[i]; } template __host__ __device__ constexpr TData Get(Number) const @@ -44,10 +56,10 @@ struct Array static_for<0, NSize, 1>{}([&](auto I) { constexpr index_t i = I.Get(); - new_array[i] = mData[i]; + new_array(i) = mData[i]; }); - new_array[NSize] = x; + new_array(NSize) = x; return new_array; } @@ -62,20 +74,9 @@ __host__ __device__ constexpr auto sequence2array(Sequence) template __host__ __device__ constexpr auto make_zero_array() { -#if 0 - Array a; - - static_for<0, NSize, 1>{}([&](auto I) { - constexpr index_t i = I.Get(); - a[i] = static_cast(0); - }); - - return a; -#else constexpr auto zero_sequence = typename uniform_sequence_gen::SeqType{}; constexpr auto zero_array = sequence2array(zero_sequence); return zero_array; -#endif } template @@ -94,44 +95,26 @@ __host__ __device__ constexpr auto reorder_array_given_new2old(const Array -__host__ __device__ constexpr auto reorder_array_given_old2new(const Array& old_array, - Sequence old2new) -{ - Array new_array; - - static_assert(NSize == sizeof...(IRs), "NSize not consistent"); - - static_for<0, NSize, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - new_array[old2new.Get(IDim)] = old_array[idim]; - }); - - return new_array; -} -#else template -struct reorder_array_given_old2new_impl +struct lambda_reorder_array_given_old2new { - const Array& old_array_ref; - Array& new_array_ref; + const Array& old_array; + Array& new_array; - __host__ - __device__ constexpr reorder_array_given_old2new_impl(const Array& old_array, - Array& new_array) - : old_array_ref(old_array), new_array_ref(new_array) + __host__ __device__ constexpr lambda_reorder_array_given_old2new( + const Array& old_array_, Array& new_array_) + : old_array(old_array_), new_array(new_array_) { } template __host__ __device__ constexpr void operator()(Number) const { - TData old_data = old_array_ref.Get(Number{}); + TData old_data = old_array[IOldDim]; constexpr index_t INewDim = MapOld2New::Get(Number{}); - new_array_ref.Set(Number{}, old_data); + new_array.Set(Number{}, old_data); } }; @@ -144,11 +127,10 @@ __host__ __device__ constexpr auto reorder_array_given_old2new(const Array{}( - reorder_array_given_old2new_impl>(old_array, new_array)); + lambda_reorder_array_given_old2new>(old_array, new_array)); return new_array; } -#endif template __host__ __device__ constexpr auto extract_array(const Array& old_array, ExtractSeq) @@ -161,7 +143,7 @@ __host__ __device__ constexpr auto extract_array(const Array& old_ static_for<0, new_size, 1>{}([&](auto I) { constexpr index_t i = I.Get(); - new_array[i] = old_array[ExtractSeq::Get(I)]; + new_array(i) = old_array[ExtractSeq::Get(I)]; }); return new_array; @@ -176,7 +158,7 @@ __host__ __device__ constexpr auto operator+(Array a, Array{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] + b[i]; + result(i) = a[i] + b[i]; }); return result; @@ -191,7 +173,7 @@ __host__ __device__ constexpr auto operator-(Array a, Array{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] - b[i]; + result(i) = a[i] - b[i]; }); return result; @@ -208,7 +190,7 @@ __host__ __device__ constexpr auto operator+(Array a, Sequence{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] + b.Get(I); + result(i) = a[i] + b.Get(I); }); return result; @@ -225,7 +207,7 @@ __host__ __device__ constexpr auto operator-(Array a, Sequence{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] - b.Get(I); + result(i) = a[i] - b.Get(I); }); return result; @@ -242,7 +224,7 @@ __host__ __device__ constexpr auto operator*(Array a, Sequence{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a[i] * b.Get(I); + result(i) = a[i] * b.Get(I); }); return result; @@ -259,7 +241,7 @@ __host__ __device__ constexpr auto operator-(Sequence a, Array{}([&](auto I) { constexpr index_t i = I.Get(); - result[i] = a.Get(I) - b[i]; + result(i) = a.Get(I) - b[i]; }); return result; diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 8d5ceb3825..2333035190 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -9,6 +9,8 @@ template struct ConstantMergedTensorDescriptor { + using Type = ConstantMergedTensorDescriptor; + static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; static constexpr index_t nDim = sizeof...(OriginalDimMergeSeqs); @@ -74,43 +76,17 @@ struct ConstantMergedTensorDescriptor return OriginalTensorDesc::GetElementSize(); } -#if 0 - __host__ __device__ static constexpr auto - GetOriginalMultiIndexFromMultiIndex(Array multi_id) - { - Array original_multi_id; - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); - - // get partial original-multi-id corresponding to this merged dimension - const auto original_multi_id_partial = - OriginalTensorDesc::Extract(original_dims_partial) - .GetMultiIndexFrom1dIndex(multi_id[idim]); - - static_for<0, original_dims_partial.GetSize(), 1>{}([&](auto I_) { - constexpr auto I = decltype(I_){}; - constexpr index_t idim_original = original_dims_partial.Get(I); - - original_multi_id[idim_original] = original_multi_id_partial[I.Get()]; - }); - }); - - return original_multi_id; - } -#else template - struct GetOriginalMultiIndexFromMultiIndex_impl1 + struct lambda_1_GetOriginalMultiIndexFromMultiIndex { - const Array& original_multi_id_partial_ref; - Array& original_multi_id_ref; + const Array& original_multi_id_partial; + Array& original_multi_id; - __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl1( - const Array& original_multi_id_partial, - Array& original_multi_id) - : original_multi_id_partial_ref(original_multi_id_partial), - original_multi_id_ref(original_multi_id) + __host__ __device__ constexpr lambda_1_GetOriginalMultiIndexFromMultiIndex( + const Array& original_multi_id_partial_, + Array& original_multi_id_) + : original_multi_id_partial(original_multi_id_partial_), + original_multi_id(original_multi_id_) { } @@ -119,37 +95,36 @@ struct ConstantMergedTensorDescriptor { constexpr index_t idim_original = OriginalDimsPartial::Get(Number{}); - index_t itmp = original_multi_id_partial_ref.Get(Number{}); + index_t itmp = original_multi_id_partial[I]; - original_multi_id_ref.Set(Number{}, itmp); + original_multi_id.Set(Number{}, itmp); } }; - struct GetOriginalMultiIndexFromMultiIndex_impl0 + struct lambda_0_GetOriginalMultiIndexFromMultiIndex { - const Array& multi_id_ref; - Array& original_multi_id_ref; + const Array& multi_id; + Array& original_multi_id; - __host__ __device__ constexpr GetOriginalMultiIndexFromMultiIndex_impl0( - const Array& multi_id, Array& original_multi_id) - : multi_id_ref(multi_id), original_multi_id_ref(original_multi_id) + __host__ __device__ constexpr lambda_0_GetOriginalMultiIndexFromMultiIndex( + const Array& multi_id_, Array& original_multi_id_) + : multi_id(multi_id_), original_multi_id(original_multi_id_) { } template __host__ __device__ constexpr void operator()(Number) const { - constexpr auto original_dims_partial = - std::get(std::tuple{}); + constexpr auto original_dims_partial = std::get(Type::mOriginalDimMergeSeqs); // get partial original-multi-id corresponding to this merged dimension const auto original_multi_id_partial = OriginalTensorDesc::Extract(original_dims_partial) - .GetMultiIndexFrom1dIndex(multi_id_ref[IDim]); + .GetMultiIndexFrom1dIndex(multi_id[IDim]); static_for<0, original_dims_partial.GetSize(), 1>{}( - GetOriginalMultiIndexFromMultiIndex_impl1( - original_multi_id_partial, original_multi_id_ref)); + lambda_1_GetOriginalMultiIndexFromMultiIndex( + original_multi_id_partial, original_multi_id)); } }; @@ -160,7 +135,7 @@ struct ConstantMergedTensorDescriptor Array original_multi_id; static_for<0, nDim, 1>{}( - GetOriginalMultiIndexFromMultiIndex_impl0(multi_id, original_multi_id)); + lambda_0_GetOriginalMultiIndexFromMultiIndex(multi_id, original_multi_id)); return original_multi_id; } @@ -174,7 +149,6 @@ struct ConstantMergedTensorDescriptor return OriginalTensorDesc::GetOffsetFromMultiIndex(original_multi_id); } -#endif __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Array multi_id) @@ -192,9 +166,9 @@ struct ConstantMergedTensorDescriptor __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) { - constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths()); + constexpr auto packed_desc = make_ConstantTensorDescriptor_packed(GetLengths()); - return dummy_desc.GetMultiIndexFrom1dIndex(id); + return packed_desc.GetMultiIndexFrom1dIndex(id); } }; diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index afafab8e0e..3d6ea12f2b 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -57,17 +57,38 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } - __host__ __device__ static constexpr bool AreStridesNonAscending() + struct lambda_AreDimensionsContinuous { - bool flag = true; + bool& is_continuous; - static_for<0, nDim - 1, 1>{}([&](auto IDim) { - constexpr auto IDim_p1 = Number{}; + __host__ __device__ constexpr lambda_AreDimensionsContinuous(bool& is_continuous_) + : is_continuous(is_continuous_) + { + } - flag = flag && (GetLength(IDim) >= GetLength(IDim_p1)); - }); + template + __host__ __device__ constexpr void operator()(X IDim) const + { + constexpr auto IDim_p1 = IDim + Number<1>{}; - return flag; + is_continuous = + is_continuous && (GetStride(IDim) >= GetStride(IDim_p1) && + GetStride(IDim) == GetStride(IDim_p1) * GetLength(IDim_p1)); + } + }; + + __host__ __device__ static constexpr bool AreDimensionsContinuous() + { + bool is_continuous = true; + + static_for<0, nDim - 1, 1>{}(lambda_AreDimensionsContinuous(is_continuous)); + + return is_continuous; + } + + __host__ __device__ static constexpr bool IsPackedTensor() + { + return AreDimensionsContinuous() && GetStride(Number{}) == 1; } template @@ -92,40 +113,24 @@ struct ConstantTensorDescriptor return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } -#if 0 + // emulate constexpr lambda template - __host__ __device__ static constexpr index_t - GetOffsetFromMultiIndex(Array multi_id) + struct lambda_GetOffsetFromMultiIndex { - static_assert(NSize == nDim, "wrong! Dimension not consistent"); + Array& multi_id; + index_t& offset; - index_t offset = 0; - - static_for<0, nDim, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - offset += multi_id[idim] * GetStride(IDim); - }); - - return offset; - } -#else - template - struct GetOffsetFromMultiIndex_impl - { - Array& multi_id_ref; - index_t& offset_ref; - - __host__ __device__ constexpr GetOffsetFromMultiIndex_impl(Array& multi_id, - index_t& offset) - : multi_id_ref(multi_id), offset_ref(offset) + __host__ + __device__ constexpr lambda_GetOffsetFromMultiIndex(Array& multi_id_, + index_t& offset_) + : multi_id(multi_id_), offset(offset_) { } - template - __host__ __device__ constexpr bool operator()(Number) const + template + __host__ __device__ constexpr void operator()(X IDim) const { - offset_ref += multi_id_ref.Get(Number{}) * Type::GetStride(Number{}); - return true; + offset += multi_id.Get(IDim) * Type::GetStride(IDim); } }; @@ -137,11 +142,10 @@ struct ConstantTensorDescriptor index_t offset = 0; - static_for<0, nDim, 1>{}(GetOffsetFromMultiIndex_impl(multi_id, offset)); + static_for<0, nDim, 1>{}(lambda_GetOffsetFromMultiIndex(multi_id, offset)); return offset; } -#endif template __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Is... is) @@ -160,47 +164,26 @@ struct ConstantTensorDescriptor multi_id * GetStrides(), mod_conv::plus{}, Number<0>{}); } -#if 0 - __host__ __device__ static constexpr Array GetMultiIndexFrom1dIndex(index_t id) + // emulate constexpr lambda + template + struct lambda_GetMultiIndexFrom1dIndex { - Array multi_id; + index_t& id; + Array& multi_id; - constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths()); - - // calculate index in each of the dimensions in the order of their dimension - static_for<0, nDim - 1, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - constexpr index_t stride = dummy_strides.Get(Number{}); - multi_id[idim] = id / stride; - id -= multi_id[idim] * stride; - }); - - multi_id[nDim - 1] = id / dummy_strides.Get(Number{}); - - return multi_id; - } -#else - struct GetMultiIndexFrom1dIndex_impl - { - using DummyStrides = decltype(calculate_tensor_strides_packed(GetLengths())); - - index_t& id_ref; - Array& multi_id_ref; - - __host__ __device__ constexpr GetMultiIndexFrom1dIndex_impl(index_t& id, - Array& multi_id) - : id_ref(id), multi_id_ref(multi_id) + __host__ + __device__ constexpr lambda_GetMultiIndexFrom1dIndex(index_t& id_, + Array& multi_id_) + : id(id_), multi_id(multi_id_) { } - template - __host__ __device__ constexpr bool operator()(Number) const + template + __host__ __device__ constexpr void operator()(X IDim) const { - constexpr index_t stride = DummyStrides::Get(Number{}); - multi_id_ref.Set(Number{}, id_ref / stride); - id_ref -= multi_id_ref.Get(Number{}) * stride; - - return true; + constexpr index_t stride = PackedStrides::Get(IDim); + multi_id.Set(IDim, id / stride); + id -= multi_id[IDim] * stride; } }; @@ -208,27 +191,15 @@ struct ConstantTensorDescriptor { Array multi_id; - constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths()); + using PackedStrides = decltype(calculate_tensor_strides_packed(GetLengths())); // calculate index in each of the dimensions in the order of their dimension - static_for<0, nDim - 1, 1>{}(GetMultiIndexFrom1dIndex_impl(id, multi_id)); + static_for<0, nDim - 1, 1>{}(lambda_GetMultiIndexFrom1dIndex(id, multi_id)); - index_t itmp = id / dummy_strides.Get(Number{}); - - multi_id.Set(Number{}, itmp); + multi_id.Set(Number{}, id / PackedStrides::Get(Number{})); return multi_id; } -#endif - -#if 0 - // return type is Sequence<...> - template - __host__ __device__ static constexpr auto GetMultiIndexFrom1dIndex(Number) - { - return inclusive_scan_sequence(f_impl, GetStrides(), Number{}); - } -#endif __host__ __device__ static constexpr auto GetOriginalMultiIndexFromMultiIndex(Array multi_id) @@ -236,9 +207,10 @@ struct ConstantTensorDescriptor return multi_id; } - // This function doesn't do carry check on the highest dimension, for performance reason. - // It is the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound - // on the highest dimension + // This function doesn't do carry check on the highest dimension for positive stepping (or + // borrow check on the lowest dimension for negative stepping) , for performance reason. It is + // the user's responsibility to make sure the result "new_mutli_id" is not out-of-bound on the + // highest dimension for positive stepping (or on the lowest dimension for negative stepping) template __host__ __device__ static Array UpdateMultiIndexGivenStepSizeOf1dIndex(Array old_multi_id, @@ -262,14 +234,14 @@ struct ConstantTensorDescriptor if(carry) { - ++new_multi_id[idim]; + ++new_multi_id(idim); } carry = false; if(new_multi_id[idim] >= GetLength(IDim)) { - new_multi_id[idim] -= GetLength(IDim); + new_multi_id(idim) -= GetLength(IDim); carry = true; } }); @@ -288,14 +260,14 @@ struct ConstantTensorDescriptor if(borrow) { - --new_multi_id[idim]; + --new_multi_id(idim); } borrow = false; if(new_multi_id[idim] < GetLength(IDim)) { - new_multi_id[idim] += GetLength(IDim); + new_multi_id(idim) += GetLength(IDim); borrow = true; } }); @@ -382,15 +354,7 @@ struct ConstantTensorDescriptor return ConstantTensorDescriptor{}; } - template - struct f_unfold_impl - { - __host__ __device__ constexpr index_t operator()(index_t x) const - { - return x > Threashold ? x - Delta : x; - } - }; - + // this function unfold dimension [FirstUnfoldDim, ..., LastUnfoldDim] into 1 dimension template __host__ __device__ static constexpr auto Unfold(Number, Number) { @@ -398,24 +362,6 @@ struct ConstantTensorDescriptor FirstUnfoldDim <= LastUnfoldDim, "wrong! should have FirstUnfoldDim <= LastUnfoldDim!"); -#if 0 // cannot compile: compiler complain about constexpr - // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be - // packed in memory, otherwise, unfolding is invalid - static_for{}([&](auto IDim_) { - constexpr auto IDim = decltype(IDim_){}; - constexpr auto IDim_p1 = IDim + Number<1>{}; - - // check stride - static_assert( - GetStride(IDim) >= GetStride(IDim_p1), - "wrong! dimensions to be unfolded need to be in descending order w.r.t strides"); - - // check if packed - static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim), - "wrong! dimensions to be unfolded need to be packed"); - }); -#endif - // left and right constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{}; constexpr auto middle = @@ -423,6 +369,9 @@ struct ConstantTensorDescriptor constexpr auto right = typename arithmetic_sequence_gen::SeqType{}; + // dimensions to be unfolded need to be continuous + static_assert(Type::Extract(middle).AreDimensionsContinuous(), "wrong! not unfoldable"); + // unfolded length, stride constexpr index_t unfold_length = accumulate_on_sequence( GetLengths().Extract(middle), mod_conv::multiplies{}, Number<1>{}); @@ -446,16 +395,16 @@ struct ConstantTensorDescriptor template __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) { - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } #if 0 // require sequence_sort, which is not implemented yet template __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) { - return ConstantTensorDescriptor{} + return ConstantTensorDescriptor{} } #endif }; diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index e5edec4ba6..fdb3634363 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -16,7 +16,23 @@ struct Sequence { static_assert(I < mSize, "wrong! I too large"); - // the last dummy element is to prevent compiler complain about empty Sequence + // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[I]; + } + + template + __host__ __device__ constexpr index_t operator[](Number) const + { + static_assert(I < mSize, "wrong! I too large"); + + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[I]; + } + + // make sure I is constepxr + __host__ __device__ constexpr index_t operator[](index_t I) const + { const index_t mData[mSize + 1] = {Is..., 0}; return mData[I]; } @@ -30,6 +46,9 @@ struct Sequence "wrong! invalid new2old map"); #endif + static_assert(sizeof...(Is) == sizeof...(IRs), + "wrong! new2old map should have the same size as Sequence to be rerodered"); + return Sequence{})...>{}; } @@ -322,11 +341,6 @@ __host__ __device__ constexpr auto operator-(Sequence seq_x, Sequence{}( - [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to undeflow"); }); -#endif - return Sequence<(Xs - Ys)...>{}; } @@ -363,15 +377,6 @@ __host__ __device__ constexpr auto operator+(Sequence, Number) template __host__ __device__ constexpr auto operator-(Sequence, Number) { -#if 0 // TODO: turn it on. Doesn't compile - constexpr auto seq_x = Sequence{}; - - static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { - constexpr auto I = decltype(Iter){}; - static_assert(seq_x.Get(I) >= Y, "wrong! going to underflow"); - }); -#endif - return Sequence<(Xs - Y)...>{}; } @@ -404,13 +409,6 @@ __host__ __device__ constexpr auto operator-(Number, Sequence) { constexpr auto seq_x = Sequence{}; -#if 0 - static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { - constexpr auto I = decltype(Iter){}; - static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow"); - }); -#endif - return Sequence<(Y - Xs)...>{}; } @@ -482,25 +480,6 @@ __host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number{}).Reverse(); } -template -struct accumulate_on_sequence_impl -{ - template - __host__ __device__ constexpr index_t operator()(IDim) const - { - return Seq{}.Get(IDim{}); - } -}; - -template -__host__ __device__ constexpr index_t - accumulate_on_sequence(Seq, Reduce, Number /*initial_value*/) -{ - constexpr index_t a = - static_const_reduce_n{}(accumulate_on_sequence_impl{}, Reduce{}); - return Reduce{}(a, I); -} - template __host__ __device__ constexpr auto Sequence::PopFront() { diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index d3ef8299d9..5665fb2726 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -122,7 +122,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto src_partial_original_desc = SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims); - mThreadSrcPartialOffsets[idim] = src_partial_original_desc.GetOffsetFromMultiIndex( + mThreadSrcPartialOffsets(idim) = src_partial_original_desc.GetOffsetFromMultiIndex( extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims)); }); @@ -136,7 +136,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto dst_partial_original_desc = DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims); - mThreadDstPartialOffsets[idim] = dst_partial_original_desc.GetOffsetFromMultiIndex( + mThreadDstPartialOffsets(idim) = dst_partial_original_desc.GetOffsetFromMultiIndex( extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims)); }); @@ -369,7 +369,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto I = decltype(I_){}; constexpr index_t idim_original = src_partial_original_dims.Get(I); - mThreadSrcOriginalMultiId[idim_original] = + mThreadSrcOriginalMultiId(idim_original) = new_src_partial_original_multi_id[I.Get()]; }); @@ -381,7 +381,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 new_src_partial_original_multi_id); // update "mThreadSrcPartialOffsets" - mThreadSrcPartialOffsets[idim] = new_src_partial_offset; + mThreadSrcPartialOffsets(idim) = new_src_partial_offset; // update "mThreadSrcOffset", do "+" before "-" to avoid underflow mThreadSrcOffset = (mThreadSrcOffset + new_src_partial_offset) - old_src_partial_offset; @@ -401,15 +401,15 @@ struct BlockwiseGenericTensorSliceCopy_v1 static_if{}([&](auto fwd) { mThreadSrcOffset += StepSize * fwd(SrcDesc{}).GetStride(IDim); - mThreadSrcOriginalMultiId[idim_original] += StepSize; + mThreadSrcOriginalMultiId(idim_original) += StepSize; - mThreadSrcPartialOffsets[idim] += StepSize * fwd(SrcDesc{}).GetStride(IDim); + mThreadSrcPartialOffsets(idim) += StepSize * fwd(SrcDesc{}).GetStride(IDim); }).Else([&](auto fwd) { mThreadSrcOffset -= StepSize * fwd(SrcDesc{}).GetStride(IDim); - mThreadSrcOriginalMultiId[idim_original] -= StepSize; + mThreadSrcOriginalMultiId(idim_original) -= StepSize; - mThreadSrcPartialOffsets[idim] -= StepSize * fwd(SrcDesc{}).GetStride(IDim); + mThreadSrcPartialOffsets(idim) -= StepSize * fwd(SrcDesc{}).GetStride(IDim); }); }); } diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp index bc8df3bc5a..28ff7003bf 100644 --- a/src/include/common.hip.hpp +++ b/src/include/common.hip.hpp @@ -110,7 +110,7 @@ __host__ __device__ constexpr T min(T x, Ts... xs) // this is wrong // TODO: implement correct least common multiple, instead of calling max() template -__host__ __device__ constexpr T least_common_multiple(T x, Ts... xs) +__host__ __device__ constexpr T lcm(T x, Ts... xs) { return max(x, xs...); } diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index ab811d292c..776abe0b2a 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -19,18 +19,7 @@ struct swallow } }; -#if 0 -template -__host__ __device__ constexpr auto unpacker(F f) -{ - return [=](auto xs_array){ f(xs...); }; -} -#endif - -// Emulate compile time if statement for C++14 -// Get the idea from -// "https://baptiste-wicht.com/posts/2015/07/simulate-static_if-with-c11c14.html" -// TODO: use if constexpr, when C++17 is supported +// Emulate if constexpr template struct static_if { @@ -81,28 +70,3 @@ struct static_if return Type{}; } }; - -template -struct static_const_reduce_n -{ - // signature of F: F(Number) - template - __host__ __device__ constexpr auto operator()(F f, Reduce r) const - { - static_assert(NLoop > 1, "out-of-range"); - - constexpr auto a = f(Number{}); - auto b = static_const_reduce_n{}(f, r); // TODO: cannot use constexpr here, weird - return r(a, b); - } -}; - -template <> -struct static_const_reduce_n<1> -{ - template - __host__ __device__ constexpr auto operator()(F f, Reduce) const - { - return f(Number<0>{}); - } -}; diff --git a/src/include/functional2.hip.hpp b/src/include/functional2.hip.hpp index e307f31f60..6633abd316 100644 --- a/src/include/functional2.hip.hpp +++ b/src/include/functional2.hip.hpp @@ -2,53 +2,6 @@ #include "functional.hip.hpp" #include "Sequence.hip.hpp" -#if 0 -template -struct static_for_impl -{ - template - constexpr __host__ __device__ void operator()(F f) const - { - static_assert(Remaining % Increment == 0, "wrong! Remaining % Increment != 0"); - static_assert(Increment <= Remaining, "will go out-of-range"); - - f(Number{}); - static_for_impl{}(f); - } -}; - -template -struct static_for_impl -{ - template - constexpr __host__ __device__ void operator()(F) const - { - // no work left, just return - return; - } -}; - -// F signature: F(Number) -template -struct static_for -{ - template - constexpr __host__ __device__ void operator()(F f) const - { - static_assert(NBegin <= NEnd, "wrongs! should have NBegin <= NEnd"); - - static_assert((NEnd - NBegin) % Increment == 0, - "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); - -#if 0 - static_if<(NBegin < NEnd)>{}( - [&](auto fwd) { static_for_impl{}(f); }); -#else - static_for_impl{}(f); -#endif - } -}; -#else template struct static_for_impl; @@ -77,4 +30,32 @@ struct static_for static_for_impl::SeqType>{}(f); } }; -#endif + +template +struct lambda_accumulate_on_sequence +{ + const Reduce& f; + index_t& result; + + __host__ __device__ constexpr lambda_accumulate_on_sequence(const Reduce& f_, index_t& result_) + : f(f_), result(result_) + { + } + + template + __host__ __device__ constexpr index_t operator()(IDim) const + { + return result = f(result, Seq::Get(IDim{})); + } +}; + +template +__host__ __device__ constexpr index_t +accumulate_on_sequence(Seq, Reduce f, Number /*initial_value*/) +{ + index_t result = Init; + + static_for<0, Seq::mSize, 1>{}(lambda_accumulate_on_sequence(f, result)); + + return result; +} diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp index 72a45fefe8..1fe5492901 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp @@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn // tensor view of blockwise input and weight in LDS // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -119,11 +119,11 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn constexpr auto wei_cyx_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); constexpr auto wei_c_y_x_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index 22e0351d84..a0ad963585 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn constexpr auto wei_c_x_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp index 89a47b9e64..bfc6f951e0 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp @@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, + constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp index 0f12df21f3..6e140a421a 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp @@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -115,7 +115,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index af7f841644..d4f341a04e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDataPerRead_N, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDataPerRead_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index e1d23053f1..6ebeb9ccca 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -106,7 +106,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, + constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -122,7 +122,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index aa7faac964..4a7ea3d2a2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -105,7 +105,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, + constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -121,7 +121,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp index d192672b25..bb17e25114 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp @@ -104,7 +104,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, + constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -120,7 +120,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index 8b2c3f388b..4af9073168 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -103,7 +103,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw // LDS tensor view // be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockReorderDataPerWrite_N, + constexpr index_t max_align = mod_conv::lcm(InBlockReorderDataPerWrite_N, WeiBlockCopyDataPerRead_K, GemmDataPerReadA, GemmDataPerReadB); @@ -119,7 +119,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // tensor view of threadwise output in register constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index 231fbbe448..3ac9c71177 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -181,7 +181,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn // LDS: be careful of alignment constexpr index_t max_align = - mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); + mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index f87caf3816..90cf8ea937 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -185,7 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer // LDS: be careful of alignment constexpr index_t max_align = - mod_conv::max(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); + mod_conv::lcm(index_t(4), InBlockCopyDataPerRead, WeiBlockCopyDataPerRead); constexpr index_t in_block_space = in_cb_block_desc.GetElementSpace(Number{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 25e61f9e35..a841eaf9b4 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -5,9 +5,8 @@ #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_generic_tensor_slice_op.hip.hpp" #include "blockwise_gemm.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" -// define B = merge(N, Ho, Wo) +// define B = merge(N0, Ho, Wo) template {}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in const auto blockwise_wei_copy = -#if 0 BlockwiseGenericTensorSliceCopy_v1, // thread_arrange_order [C, K] - Sequence<0, 1>, // src_access_order [C, K] - Sequence<0, 1>, // dst_access_order [C, K] - WeiBlockCopyDataPerAccess_K, - WeiBlockCopyDataPerAccess_K>( + Float, + decltype(wei_c_k_global_desc), + decltype(wei_c_k_block_desc), + decltype(wei_c_k_block_desc.GetLengths()), + WeiBlockCopySubLengths_C_K, + WeiBlockCopyClusterLengths_C_K, + Sequence<0, 1>, // thread_arrange_order [C, K] + Sequence<0, 1>, // src_access_order [C, K] + Sequence<0, 1>, // dst_access_order [C, K] + WeiBlockCopyDataPerAccess_K, + WeiBlockCopyDataPerAccess_K>( {0, k_block_data_on_global}, {0, 0}); -#else - Blockwise2dTensorCopy3({0, k_block_data_on_global}, - {0, 0}); -#endif - // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[CPerBlock, KPerBlock] is in LDS - // b_mtx[CPerBlocl, N1 * BPerBlock * N2] is in LDS - // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in - // register - constexpr auto a_c_k_block_mtx_desc = - make_ConstantMatrixDescriptor(Number{}, - Number{}, - Number{}); + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[CPerBlock, KPerBlock] is in LDS + // b_mtx[CPerBlocl, N1 * BPerBlock * N2] is in LDS + // c_mtx[KPerBlock, N1 * BPerBlock * N2] is distributed among threads, and saved in + // register + constexpr auto a_c_k_block_mtx_desc = make_ConstantMatrixDescriptor( + Number{}, Number{}, Number{}); constexpr auto b_c_n1bn2_block_mtx_desc = make_ConstantMatrixDescriptor(Number{}, @@ -228,7 +215,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDataPerAccess_K, GemmDataPerReadA, GemmDataPerReadB); @@ -261,18 +248,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw // LDS double buffer: preload data into LDS { - Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()]; - Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; - - blockwise_in_copy.RunLoadRegisterClipboard(p_in_block_on_global, - p_in_register_clipboard); - blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, - p_wei_register_clipboard); - - blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, - p_in_block_double); - blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, - p_wei_block_double); + blockwise_in_copy.Run(p_in_block_on_global, p_in_block_double); + blockwise_wei_copy.Run(p_wei_block_on_global, p_wei_block_double); } // LDS double buffer: main body @@ -413,7 +390,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw p_out_thread_on_global, {0, 0, 0, 0, 0, 0, 0, 0}, out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + arithmetic_sequence_gen<0, 8, 1>::SeqType{}, + Number<1>{}); } } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index d999302232..42307e1100 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -5,9 +5,8 @@ #include "ConstantMatrixDescriptor.hip.hpp" #include "blockwise_generic_tensor_slice_op.hip.hpp" #include "blockwise_gemm.hip.hpp" -#include "threadwise_tensor_slice_op.hip.hpp" -// define B = merge(N, Ho, Wo) +// define B = merge(N0, Ho, Wo) template {}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = -#if 1 BlockwiseGenericTensorSliceCopy_v1( {0, k_block_data_on_global}, {0, 0}); -#else - Blockwise2dTensorCopy3({0, k_block_data_on_global}, - {0, 0}); -#endif // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -219,8 +208,17 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB>{}; + // choose GEMM implementation here + const auto run_blockwise_gemm = [&](auto... Xs) { +#if 1 + return blockwise_gemm.Run(Xs...); +#else + return blockwise_gemm.Run_asm(Xs...); +#endif + }; + // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDataPerAccess_K, GemmDataPerReadA, GemmDataPerReadB); @@ -264,7 +262,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw __syncthreads(); - blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread); + run_blockwise_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); } @@ -294,7 +292,6 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw __syncthreads(); - // move on C: C_N1_B_N2, C_K blockwise_in_copy.MoveSlicingWindowOnSourceTensor( I0, Number{}, True); @@ -366,7 +363,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw p_out_thread_on_global, {0, 0, 0, 0, 0, 0, 0, 0}, out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + arithmetic_sequence_gen<0, 8, 1>::SeqType{}, + Number<1>{}); } } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp index 3cb67d4058..03916e874e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp @@ -7,7 +7,7 @@ #include "blockwise_gemm.hip.hpp" #include "threadwise_generic_tensor_slice_op.hip.hpp" -// define B = merge(N, Ho, Wo) +// define B = merge(N0, Ho, Wo) template {}, - Number{}); + Number{}); -// operator for blockwise copy of weight into LDS -// slice a tensor, and copy it into another tensor -// this copy operator already have blockwise offset built-in -#if 1 + // operator for blockwise copy of weight into LDS + // slice a tensor, and copy it into another tensor + // this copy operator already have blockwise offset built-in auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v1( {0, k_block_data_on_global}, {0, 0}); -#else - constexpr auto map_k_e_2_e_k = Sequence<1, 0>{}; - - auto blockwise_wei_copy = BlockwiseTensorSliceReorderCopy_v3< - BlockSize, - Float, - decltype(wei_e_k_global_desc.ReorderGivenNew2Old(map_k_e_2_e_k)), - decltype(wei_e_k_block_desc), - decltype(wei_e_k_block_desc.GetLengths().ReorderGivenNew2Old(map_k_e_2_e_k)), - decltype(WeiBlockCopySubLengths_E_K::ReorderGivenNew2Old(map_k_e_2_e_k)), - decltype(WeiBlockCopyClusterLengths_E_K::ReorderGivenNew2Old(map_k_e_2_e_k)), - Sequence<1, 0>, // MapDst2Src - WeiBlockCopyThreadClusterArrangeOrder, - WeiBlockCopySrcDataPerRead_E, - WeiBlockCopyDstDataPerWrite_K>({k_block_data_on_global, 0}, {0, 0}); -#endif // GEMM definition // c_mtx += transpose(a_mtx) * b_mtx @@ -254,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw }; // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA, GemmDataPerReadB); @@ -273,18 +256,6 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // zero out threadwise output threadwise_matrix_set_zero(c_k0k2_n1n2_thread_mtx_desc, p_out_thread); -#if 0 - if(get_block_1d_id() == 0) - { - printf("id %5u %5u: " - "mThreadSrcOffset %u, mThreadDstOffset %u \n", - get_block_1d_id(), - get_thread_local_1d_id(), - blockwise_wei_copy.mThreadSrcOffset, - blockwise_wei_copy.mThreadDstOffset); - } -#endif - const Float* p_wei_block_on_global = p_wei_global; // LDS double buffer: preload data into LDS diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp index 8fe8d8fac2..852ef978bd 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp @@ -7,7 +7,7 @@ #include "blockwise_gemm.hip.hpp" #include "threadwise_generic_tensor_slice_op.hip.hpp" -// define B = merge(N, Ho, Wo) +// define B = merge(N0, Ho, Wo) template struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw @@ -146,19 +152,20 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // input blockwise copy // slice a merged tensor, reorder and copy to a normal tensor // this copy operator already has blockwise offset built-in - auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v1< - BlockSize, - Float, - decltype(in_e_n1_b_n2_global_merged_desc), - decltype(in_e_n1_b_n2_block_desc), - decltype(in_e_n1_b_n2_block_desc.GetLengths()), - InBlockCopySubLengths_E_N1_B_N2, - InBlockCopyClusterLengths_E_N1_B_N2, - Sequence<0, 1, 3, 2>, // thread_arrange_order [E, N1, N2, B] - Sequence<0, 1, 3, 2>, // src_access_order [E, N1, N2, B] - Sequence<0, 1, 2, 3>, // dst_access_order [E, N1, B, N2] - InBlockCopySrcDataPerRead_B, - InBlockCopyDstDataPerWrite_N2>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); + auto blockwise_in_copy = + BlockwiseGenericTensorSliceCopy_v1( + {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); // weight tensor // tensor descriptor in device memory, src of blockwise copy @@ -169,7 +176,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // be careful of LDS alignment constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, - Number{}); + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor @@ -182,9 +189,9 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw decltype(wei_e_k_block_desc.GetLengths()), WeiBlockCopySubLengths_E_K, WeiBlockCopyClusterLengths_E_K, - Sequence<1, 0>, // thread_arrange_order [K, E] - Sequence<1, 0>, // src_access_order [K, E] - Sequence<0, 1>, // dst_access_order [E, K] + WeiBlockCopyThreadClusterArrangeOrder, + WeiBlockCopySrcAccessOrder, + WeiBlockCopyDstAccessOrder, WeiBlockCopySrcDataPerRead_E, WeiBlockCopyDstDataPerWrite_K>( {0, k_block_data_on_global}, {0, 0}); @@ -231,8 +238,17 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw GemmDataPerReadA, GemmDataPerReadB>{}; + // choose GEMM implementation here + const auto run_blockwise_gemm = [&](auto... Xs) { +#if 1 + return blockwise_gemm.Run(Xs...); +#else + return blockwise_gemm.Run_asm(Xs...); +#endif + }; + // LDS allocation for input and weight: be careful of alignment - constexpr index_t max_align = mod_conv::max(InBlockCopyDstDataPerWrite_N2, + constexpr index_t max_align = mod_conv::lcm(InBlockCopyDstDataPerWrite_N2, WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA, GemmDataPerReadB); @@ -254,24 +270,13 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // do work for(index_t e = 0; e < E; e += EPerBlock) { -#if 0 - if(e == 0 * EPerBlock && get_block_1d_id() == 0) - { - printf("id %5u %5u: " - "mThreadSrcOffset %u, mThreadDstOffset %u \n", - get_block_1d_id(), - get_thread_local_1d_id(), - blockwise_wei_copy.mThreadSrcOffset, - blockwise_wei_copy.mThreadDstOffset); - } -#endif // marching slicing window blockwise_in_copy.Run(p_in_global, p_in_block); blockwise_wei_copy.Run(p_wei_global, p_wei_block); __syncthreads(); - blockwise_gemm.Run(p_wei_block, p_in_block, p_out_thread); + run_blockwise_gemm(p_wei_block, p_in_block, p_out_thread); __syncthreads(); @@ -335,7 +340,8 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw p_out_thread_on_global, {0, 0, 0, 0, 0, 0, 0, 0}, out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(), - arithmetic_sequence_gen<0, 8, 1>::SeqType{}); + arithmetic_sequence_gen<0, 8, 1>::SeqType{}, + Number<1>{}); } } }; diff --git a/src/include/integral_constant.hip.hpp b/src/include/integral_constant.hip.hpp index 0f134ae76c..7b872e07c0 100644 --- a/src/include/integral_constant.hip.hpp +++ b/src/include/integral_constant.hip.hpp @@ -8,7 +8,7 @@ struct integral_constant __host__ __device__ constexpr T Get() const { return value; } }; -template +template __host__ __device__ constexpr auto operator+(integral_constant, integral_constant) { return integral_constant{}; diff --git a/src/include/threadwise_generic_tensor_slice_op.hip.hpp b/src/include/threadwise_generic_tensor_slice_op.hip.hpp index 3803ab23ac..9a7e5ae062 100644 --- a/src/include/threadwise_generic_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_generic_tensor_slice_op.hip.hpp @@ -62,7 +62,7 @@ __device__ void threadwise_generic_tensor_slice_copy_v1( #if 1 ford{}([&](auto access_multi_id) { auto data_multi_id_in_access_order = access_multi_id; - data_multi_id_in_access_order[nDim - 1] = access_multi_id[nDim - 1] * DataPerAccess; + data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess; const auto data_multi_id = reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});