From 33b5a8556b68c2cbeba555088b647310779d17e4 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Thu, 16 May 2019 22:23:18 -0500 Subject: [PATCH] adding implicit gemm v3 --- ...lution_implicit_gemm_v1_chwn_cyxk_khwn.hpp | 2 +- ...lution_implicit_gemm_v1_nchw_cyxk_khwn.hpp | 2 +- ...lution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp | 4 +- driver/driver.hip.cpp | 10 +- src/include/Array.hip.hpp | 14 ++ src/include/ConstantTensorDescriptor.hip.hpp | 60 ++--- src/include/Sequence.hip.hpp | 223 +++++++++--------- src/include/blockwise_gemm.hip.hpp | 3 +- src/include/blockwise_tensor_slice_op.hip.hpp | 33 +-- src/include/functional.hip.hpp | 8 +- ...3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp | 2 +- ...3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp | 4 +- ...3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp | 4 +- 13 files changed, 172 insertions(+), 197 deletions(-) diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 938fba037a..4e739a001e 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -140,7 +140,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_N = 2; -#elif 0 +#elif 1 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index e5c20994f2..5ea9b8f030 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_khwn_device_buf.ToDevice(out_khwn.mData.data()); -#if 0 +#if 1 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index f366f6664c..34ed48229f 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, wei_cyxk_device_buf.ToDevice(wei_cyxk.mData.data()); out_nkhw_device_buf.ToDevice(out_nkhw.mData.data()); -#if 0 +#if 1 // for 3x3, 34x34, v1r3, Pascal constexpr index_t BlockSize = 128; @@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t WeiBlockCopyDataPerRead_K = 4; constexpr index_t OutThreadCopyDataPerWrite_W = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8 constexpr index_t BlockSize = 256; diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 0b75e0083a..3e032a2333 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -13,7 +13,7 @@ #include "device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp" #include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" #include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" -#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" +//#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" struct GeneratorTensor_1 { @@ -411,7 +411,7 @@ void check_error(const Tensor& ref, const Tensor& result) int main(int argc, char* argv[]) { -#if 0 
+#if 1 // 3x3, 34x34 constexpr index_t N = 64; constexpr index_t C = 256; @@ -435,7 +435,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 3x3 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 256; @@ -608,7 +608,7 @@ int main(int argc, char* argv[]) device_convolution_direct_v2_nchw_kcyx_nkhw #elif 0 device_direct_convolution_2_vectorized_nchw_kcyx_nkhw -#elif 0 +#elif 1 device_convolution_implicit_gemm_v1_chwn_cyxk_khwn #elif 0 device_convolution_implicit_gemm_v1_nchw_cyxk_khwn @@ -616,7 +616,7 @@ int main(int argc, char* argv[]) device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw #elif 0 device_convolution_implicit_gemm_v2_chwn_cyxk_khwn -#elif 1 +#elif 0 device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw #endif (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat); diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 2370e4ed32..30e3bd0b7c 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -66,3 +66,17 @@ __host__ __device__ auto reorder_array_given_old2new(const Array& return new_array; } + +template +__host__ __device__ constexpr auto operator+(const Array& a, + const Array& b) +{ + Array result; + + static_for<0, NSize, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + result[i] = a[i] + b[i]; + }); + + return result; +} diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 64c7f4408d..d61632a389 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -88,32 +88,11 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(Lengths{}, std::multiplies{}, Number<1>{}); } -#if 0 - // c++14 doesn't support constexpr lambdas, has to use this trick instead - struct f_GetElementSpace_impl - { - template - __host__ __device__ constexpr index_t operator()(IDim idim) const - { - return (Type{}.GetLength(idim) - 1) * Type{}.GetStride(idim); - } - __host__ __device__ constexpr index_t operator()(index_t length, index_t stride) const - { - return (length - 1) * stride; - } - }; -#endif - template > __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) { -#if 0 - index_t element_space_unaligned = - static_const_reduce_n{}(f_GetElementSpace_impl{}, std::plus{}) + 1; -#else constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), std::plus{}, Number<1>{}); -#endif return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } @@ -150,10 +129,7 @@ struct ConstantTensorDescriptor constexpr auto multi_id = Sequence{}; - constexpr auto seq_tmp = - transform_sequences(std::multiplies{}, multi_id, GetStrides()); - - return accumulate_on_sequence(seq_tmp, std::plus{}, Number<0>{}); + return accumulate_on_sequence(multi_id * GetStrides(), std::plus{}, Number<0>{}); } __host__ __device__ static Array GetMultiIndex(index_t id) @@ -177,14 +153,14 @@ struct ConstantTensorDescriptor return ConstantTensorDescriptor{}; } - template + template __host__ __device__ static constexpr auto Extract(Number... extract_dims) { static_assert(sizeof...(IDims) <= GetNumOfDimension(), "wrong! 
too many number of dimensions to be extracted"); - return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims), - Strides{}.Extract(extract_dims)); + return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims...), + Strides{}.Extract(extract_dims...)); } template @@ -195,11 +171,11 @@ struct ConstantTensorDescriptor } template - __host__ device__ static constexpr auto Fold(Number, Number...) + __host__ __device__ static constexpr auto Fold(Number, Number...) { constexpr auto fold_intervals = Sequence{}; - constexpr fold_intervals_product = + constexpr index_t fold_intervals_product = accumulate_on_sequence(fold_intervals, std::multiplies{}, Number<1>{}); constexpr auto unfold_length = GetLength(Number{}); @@ -207,29 +183,31 @@ struct ConstantTensorDescriptor // length of the dimension to be folded needs to be dividable by fold_interval_product, // otherwise, folding is invalid - static_assert(unfold_length % fold_interval_product == 0, + static_assert(unfold_length % fold_intervals_product == 0, "wrong! length on the dimension to be folded cannot be evenly divided!"); // folded lengths constexpr auto fold_lengths = - Sequence{}.Append(fold_intervals); + Sequence{}.Append(fold_intervals); // folded strides - constexpr auto fold_strides = transform_sequences(mod_conv::scales{}, + constexpr auto fold_strides = + Number{} * reverse_scan_sequence(fold_intervals.PushBack(Number<1>{}), std::multiplies{}); // left and right lengths - constexpr auto lengths_pair = GetLengths().Split(Number{}); + constexpr auto lengths_pair = GetLengths().Split(Number{}); constexpr auto left_lengths = lengths_pair.first; constexpr auto right_lengths = lengths_pair.second.PopFront(); // left and right strides - constexpr auto strides_pair = GetStrides().Split(Number{}); + constexpr auto strides_pair = GetStrides().Split(Number{}); constexpr auto left_strides = strides_pair.first; constexpr auto right_strides = strides_pair.second.PopFront(); - return make_ConstantTensorDescriptor(left_lengths.Append(fold_lengths).Append(right_lengths), - left_strides.Append(fold_strides).Append(right_strides)); + return make_ConstantTensorDescriptor( + left_lengths.Append(fold_lengths).Append(right_lengths), + left_strides.Append(fold_strides).Append(right_strides)); } template @@ -264,8 +242,8 @@ struct ConstantTensorDescriptor constexpr index_t unfold_length = accumulate_on_sequence(fold_lengths, std::multiplies{}, Number<1>{}); - constexpr auto new_strides = - left_strides.PopBack(Number{}).Append(right_strides); + constexpr auto new_lengths = + left_lengths.PopBack(Number{}).Append(right_lengths); // strides constexpr auto strides_pair1 = Strides{}.Split(Number{}); @@ -281,7 +259,7 @@ struct ConstantTensorDescriptor constexpr index_t unfold_stride = fold_strides.Back(); constexpr auto new_strides = - left_strides.PushBack(Number{}).Append(right_strides); + left_strides.PushBack(Number{}).Append(right_strides); return make_ConstantTensorDescriptor(new_lengths, new_strides); } @@ -289,7 +267,7 @@ struct ConstantTensorDescriptor template __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { - static_assert(sizeof...(IRs) == GetNumberOfDimension(), "wrong! dimension is wrong"); + static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! 
dimension is wrong"); constexpr auto map_new2old = Sequence{}; return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old), Strides{}.ReorderGivenNew2Old(map_new2old)); diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index 6b87885780..ad9010fc0f 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -2,14 +2,7 @@ #include "constant_integral.hip.hpp" #include "functional.hip.hpp" -struct EmptySequence -{ - template - __host__ __device__ constexpr Seq Append(Seq) const - { - return {}; - } -}; +struct EmptySequence; template struct Sequence @@ -73,18 +66,18 @@ struct Sequence __host__ __device__ constexpr auto PopBack() const; - template + template __host__ __device__ constexpr auto Append(Sequence) const { return Sequence{}; } - __host__ __device__ constexpr auto Append(EmptySequence) const { return Type{}; } + __host__ __device__ constexpr auto Append(EmptySequence) const; template __host__ __device__ constexpr auto Extract(Number...) const { - return Sequence)...>{}; + return Sequence{})...>{}; } template @@ -93,8 +86,8 @@ struct Sequence template __host__ __device__ constexpr auto operator()(FirstSeq, SecondSeq) const { - constexpr new_first = FirstSeq{}.PushBack(Number{}); - constexpr new_second = SecondSeq{}.PopFront(); + constexpr index_t new_first = FirstSeq{}.PushBack(Number{}); + constexpr index_t new_second = SecondSeq{}.PopFront(); static_if<(N > 0)>{}([&](auto fwd) { return split_impl{}(new_first, fwd(new_second)); @@ -102,26 +95,10 @@ struct Sequence } }; - // split one sequence to two sequnces: [0, I) and [I, nSize) + // split one sequence to two sequnces: [0, I) and [I, mSize) // return type is std::pair template - __host__ __device__ constexpr auto Split(Number) const - { - static_assert(I <= nSize, "wrong! split position is too high!"); - - static_if<(I == 0)>{}( - [&](auto fwd) { return std::make_pair(EmptySequence<>{}, fwd(Type{})); }); - - static_if<(I == nSize)>{}( - [&](auto fwd) { return std::make_pair(Type<>{}, fwd(EmptySequence<>{})); }); - - static_if<(I > 0 && I < nSize)>{}([&](auto fforwader) { - constexpr auto first = Sequence {} - constexpr auto second = Type{}.PopFront(); - - return split_impl{}(first, fwd(second)); - }); - } + __host__ __device__ constexpr auto Split(Number) const; template __host__ __device__ constexpr auto Modify(Number, Number) const @@ -135,6 +112,53 @@ struct Sequence } }; +struct EmptySequence +{ + __host__ __device__ static constexpr index_t GetSize() { return 0; } + + template + __host__ __device__ constexpr auto PushFront(Number) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr auto PushBack(Number) const + { + return Sequence{}; + } + + template + __host__ __device__ constexpr Seq Append(Seq) const + { + return Seq{}; + } +}; + +template +__host__ __device__ constexpr auto Sequence::Append(EmptySequence) const +{ + return Type{}; +} + +// split one sequence to two sequnces: [0, I) and [I, mSize) +// return type is std::pair +template +template +__host__ __device__ constexpr auto Sequence::Split(Number) const +{ + static_assert(I <= GetSize(), "wrong! 
split position is too high!"); + + static_if<(I == 0)>{}([&](auto fwd) { return std::make_pair(EmptySequence{}, fwd(Type{})); }); + + static_if<(I == GetSize())>{}( + [&](auto fwd) { return std::make_pair(Type{}, fwd(EmptySequence{})); }); + + static_if<(I > 0 && I < GetSize())>{}( + [&](auto fwd) { return split_impl{}(EmptySequence{}, fwd(Type{})); }); +} + +#if 0 template __host__ __device__ auto make_increasing_sequence(Number, Number, Number) { @@ -142,15 +166,10 @@ __host__ __device__ auto make_increasing_sequence(Number, Number, // not implemented } - -template -__host__ __device__ auto make_uniform_sequence(Number, Number); -{ - // not implemented -} +#endif template -__host__ __device__ constexpr auto operator+(Sequence, Sequence) const +__host__ __device__ constexpr auto operator+(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -158,17 +177,18 @@ __host__ __device__ constexpr auto operator+(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator-(Sequence seq_x, Sequence seq_y) const +__host__ __device__ constexpr auto operator-(Sequence seq_x, Sequence seq_y) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); - static_for<0, xs.GetSize(), 1>{}([&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I)); }); + static_for<0, seq_x.GetSize(), 1>{}( + [&](auto I) { static_assert(seq_x.Get(I) >= seq_y.Get(I), "wrong! going to undeflow"); }); return Sequence<(Xs - Ys)...>{}; } template -__host__ __device__ constexpr auto operator*(Sequence, Sequence)const +__host__ __device__ constexpr auto operator*(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -176,7 +196,7 @@ __host__ __device__ constexpr auto operator*(Sequence, Sequence)co } template -__host__ __device__ constexpr auto operator/(Sequence, Sequence) const +__host__ __device__ constexpr auto operator/(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -184,15 +204,7 @@ __host__ __device__ constexpr auto operator/(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator%(Sequence, Sequence) const -{ - static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); - - return Sequence<(Xs % Ys)...>{}; -} - -template -__host__ __device__ constexpr auto operator%(Sequence, Sequence) const +__host__ __device__ constexpr auto operator%(Sequence, Sequence) { static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); @@ -200,63 +212,79 @@ __host__ __device__ constexpr auto operator%(Sequence, Sequence) c } template -__host__ __device__ constexpr auto operator+(Sequence, Number) const +__host__ __device__ constexpr auto operator+(Sequence, Number) { - return seq_x + make_uniform_sequence(Number, Number{}); + return Sequence<(Xs + Y)...>{}; } template -__host__ __device__ constexpr auto operator-(Sequence, Number) const +__host__ __device__ constexpr auto operator-(Sequence, Number) { - return seq_x - make_uniform_sequence(Number, Number{}); + constexpr auto seq_x = Sequence{}; + +#if 0 + static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { + constexpr auto I = decltype(Iter){}; + static_assert(seq_x.Get(I) >= Y, "wrong! 
going to underflow"); + }); +#endif + + return Sequence<(Xs - Y)...>{}; } template -__host__ __device__ constexpr auto operator*(Sequence, Number)const +__host__ __device__ constexpr auto operator*(Sequence, Number) { - return seq_x * make_uniform_sequence(Number, Number{}); + return Sequence<(Xs * Y)...>{}; } template -__host__ __device__ constexpr auto operator/(Sequence, Number) const +__host__ __device__ constexpr auto operator/(Sequence, Number) { - return seq_x / make_uniform_sequence(Number, Number{}); + return Sequence<(Xs / Y)...>{}; } template -__host__ __device__ constexpr auto operator%(Sequence seq_x, Number y) const +__host__ __device__ constexpr auto operator%(Sequence, Number) { - return seq_x % make_uniform_sequence(Number, Number{}); + return Sequence<(Xs % Y)...>{}; } -template -__host__ __device__ constexpr auto operator+(Number, Sequence) const +template +__host__ __device__ constexpr auto operator+(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) + Sequence{}; + return Sequence<(Y + Xs)...>{}; } -template -__host__ __device__ constexpr auto operator-(Number, Sequence) const +template +__host__ __device__ constexpr auto operator-(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) - Sequence{}; + constexpr auto seq_x = Sequence{}; + + static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { + constexpr auto I = decltype(Iter){}; + static_assert(seq_x.Get(I) <= Y, "wrong! going to underflow"); + }); + + return Sequence<(Y - Xs)...>{}; } -template -__host__ __device__ constexpr auto operator*(Number, Sequence)const +template +__host__ __device__ constexpr auto operator*(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) * Sequence{}; + return Sequence<(Y * Xs)...>{}; } -template -__host__ __device__ constexpr auto operator/(Number, Sequence) const +template +__host__ __device__ constexpr auto operator/(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) / Sequence{}; + return Sequence<(Y / Xs)...>{}; } -template -__host__ __device__ constexpr auto operator%(Number, Sequence) const +template +__host__ __device__ constexpr auto operator%(Number, Sequence) { - return make_uniform_sequence(Number{}, Number{}) % Sequence{}; + return Sequence<(Y % Xs)...>{}; } template @@ -268,7 +296,7 @@ __host__ __device__ constexpr auto sequence_pop_front(Sequence) #if 0 // TODO: for some reason, compiler cannot instantiate this template -template +template __host__ __device__ constexpr auto sequence_pop_back(Sequence) { static_assert(sizeof...(Is) > 0, "empty Sequence!"); @@ -356,8 +384,6 @@ __host__ __device__ constexpr auto } #endif -#if 1 -// TODO: fix these mess template __host__ __device__ constexpr auto transform_sequences(F f, Sequence) { @@ -382,45 +408,6 @@ transform_sequences(F f, Sequence, Sequence, Sequence) return Sequence{}; } -#else -// TODO:: these doesn't compile -template -struct transform_sequences_impl -{ - template - __host__ __device__ constexpr auto operator()(F f, Y y, Xs... xs) const - { - static_assert(NRemain > 1, "wrong! should have NRemain > 1"); - - constexpr index_t N = f(Xs{}.Get(Number<0>{})...); - constexpr auto y_new = y.PushBack(Number{}); - - return transform_sequences_impl{}(f, y_new, xs.PopFront()...); - } -}; - -template <> -struct transform_sequences_impl<1> -{ - template - __host__ __device__ constexpr auto operator()(F f, Y, Xs...) 
const - { - constexpr index_t N = f(Xs{}.Get(Number<0>{})...); - return Y{}.PushBack(Number{}); - } -}; - -template -__host__ __device__ constexpr auto transform_sequences(F f, X x, Xs... xs) -{ - constexpr index_t nSize = X::GetSize(); - constexpr auto I0 = Number<0>{}; - - constexpr auto y0 = Sequence{}; - - return transform_sequences_impl{}(f, y0, x.PopFront(), xs.PopFront()...); -} -#endif template __host__ __device__ constexpr auto Sequence::PopFront() const diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 722c1ae9bb..3159eb5ae2 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -7,6 +7,7 @@ template {}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); @@ -153,27 +152,24 @@ struct BlockwiseTensorSliceReorderCopy_v3 { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - constexpr auto src_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims); + constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims; - constexpr auto clipboard_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths); + constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id); constexpr index_t clipboard_offset = @@ -193,27 +189,24 @@ struct BlockwiseTensorSliceReorderCopy_v3 { constexpr auto thread_sub_tensor_lengths = SrcSubLengths{}; - constexpr auto src_data_per_cluster_per_dims = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, SrcClusterLengths{}); + constexpr auto src_data_per_cluster_per_dims = + thread_sub_tensor_lengths * SrcClusterLengths{}; constexpr auto repeat_lengths = transform_sequences(mod_conv::integer_divide_ceiler{}, SrcLengths{}, src_data_per_cluster_per_dims); - constexpr auto thread_tensor_lengths = transform_sequences( - std::multiplies{}, thread_sub_tensor_lengths, repeat_lengths); + constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); 
static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; - constexpr auto clipboard_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, thread_sub_tensor_lengths); + constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; - constexpr auto src_data_multi_id = transform_sequences( - std::multiplies{}, repeat_multi_id, src_data_per_cluster_per_dims); + constexpr auto src_data_multi_id = repeat_multi_id * src_data_per_cluster_per_dims; // reorder src_data_multi_id to get dst_data_multi_id constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{}); diff --git a/src/include/functional.hip.hpp b/src/include/functional.hip.hpp index 79b5b22cc3..e25dffa8c7 100644 --- a/src/include/functional.hip.hpp +++ b/src/include/functional.hip.hpp @@ -37,7 +37,8 @@ struct static_if { // This is a trick for compiler: // Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it, - // this will make "f" a generic lambda, so that "f" won't be compiled until here + // this will make "f" a generic lambda, so that "f" won't be compiled until being + // instantiated here f(forwarder{}); return Type{}; } @@ -65,7 +66,8 @@ struct static_if { // This is a trick for compiler: // Pass forwarder to lambda "f" as "auto" argument, and maks sure "f" will use it, - // this will make "f" a generic lambda, so that "f" won't be compiled until here + // this will make "f" a generic lambda, so that "f" won't be compiled until being + // instantiated here f(forwarder{}); return Type{}; } @@ -105,7 +107,7 @@ struct static_for static_assert((NEnd - NBegin) % Increment == 0, "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); - static_if<(NBegin < End)>{}( + static_if<(NBegin < NEnd)>{}( [&](auto fwd) { static_for_impl{}(f); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index 3e2ea73bd9..1cf033bef5 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -201,7 +201,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... 
Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index e33b25d429..45933c6bc2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -142,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn decltype(map_chwn2nchw), InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] @@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 605496d1c8..33673dbaa4 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -142,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw decltype(map_chwn2nchw), InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW, InBlockReorderDataPerRead_W, - InBlockReorderDataPerWrite_N>{}; + InBlockReorderDataPerWrite_N>({0, 0, 0, 0}, {0, 0, 0, 0}); // blockwise wei copy // format is [CPerBlock, KPerBlock] @@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw // choose GEMM implementation here const auto run_blockwise_batch_gemm = [&](auto... Xs) { -#if 0 +#if 1 return blockwise_batch_gemm.Run(Xs...); #elif 0 return blockwise_batch_gemm.Run_asm(Xs...);
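
The recurring change in blockwise_tensor_slice_op.hip.hpp above is swapping `transform_sequences(std::multiplies<index_t>{}, a, b)` for the element-wise `operator*` that this patch adds to Sequence. Below is a minimal, standalone sketch (not part of the patch) of that usage pattern: the stripped-down `Sequence` here drops the `__host__ __device__` qualifiers and the other members of the real class in src/include/Sequence.hip.hpp, and the variable names and lengths are made-up illustrative values.

```cpp
#include <cstdio>

using index_t = int;

// simplified stand-in for the Sequence in src/include/Sequence.hip.hpp
template <index_t... Is>
struct Sequence
{
    static constexpr index_t GetSize() { return sizeof...(Is); }
};

// element-wise product of two compile-time sequences, as introduced by this patch
template <index_t... Xs, index_t... Ys>
constexpr auto operator*(Sequence<Xs...>, Sequence<Ys...>)
{
    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
    return Sequence<(Xs * Ys)...>{};
}

int main()
{
    // hypothetical per-thread sub-tensor lengths and thread-cluster lengths
    constexpr auto sub_lengths     = Sequence<1, 2, 1, 4>{};
    constexpr auto cluster_lengths = Sequence<4, 4, 8, 2>{};

    // replaces transform_sequences(std::multiplies<index_t>{}, sub_lengths, cluster_lengths);
    // result is Sequence<4, 8, 8, 8>, computed entirely at compile time
    constexpr auto data_per_cluster_per_dims = sub_lengths * cluster_lengths;

    printf("result has %d dimensions\n", data_per_cluster_per_dims.GetSize());
    return 0;
}
```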