diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index d74b41ff22..919c3503f5 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // reorder input - auto in_chwn_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto in_chwn_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: "); Tensor in_chwn(make_TensorDescriptor(in_chwn_desc)); @@ -64,8 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index a4381b370b..3237a7310b 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -37,7 +37,7 @@ void 
device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -50,8 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto out_khwn_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index b89a6d9bd3..acd8176023 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); diff --git a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp index 7e949306ef..b3b6d785bf 100644 --- a/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp @@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(InDesc, 
constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto wei_cyxk_desc = make_ConstantTensorDescriptor_packed(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index 928759eef4..9022bcdf2a 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -443,7 +443,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 0 +#elif 1 // 3x3 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 256; @@ -455,7 +455,7 @@ int main(int argc, char* argv[]) constexpr index_t HPad = 0; constexpr index_t WPad = 0; -#elif 1 +#elif 0 // 1x1 filter, 28x28 image constexpr index_t N = 128; constexpr index_t C = 512; @@ -568,8 +568,8 @@ int main(int argc, char* argv[]) auto lower_pads = Sequence{}; auto upper_pads = Sequence{}; - auto in_nchw_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); - auto wei_kcyx_desc = make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + auto in_nchw_desc = make_ConstantTensorDescriptor_packed(Sequence{}); + auto wei_kcyx_desc = make_ConstantTensorDescriptor_packed(Sequence{}); auto out_nkhw_desc = get_convolution_with_padding_output_default_4d_tensor_descriptor( in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads); diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index e8fb88d9b2..157546c979 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -114,7 +114,7 @@ struct ConstantMergedTensorDescriptor __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) { - constexpr auto dummy_desc = make_ConstantTensorDescriptor_default_rank_packed(GetLengths()); + 
constexpr auto dummy_desc = make_ConstantTensorDescriptor_packed(GetLengths()); return dummy_desc.GetMultiIndexFrom1dIndex(id); } @@ -128,7 +128,7 @@ __host__ __device__ constexpr auto make_ConstantMergedTensorDescriptor(OriginalT } template -__host__ __device__ void print_ConstantMergedTensorDescriptor(TDesc, const char* s) +__host__ __device__ void print_ConstantMergedTensorDescriptor(const char* s, TDesc) { - print_ConstantTensorDescriptor(TDesc::GetOriginalTensorDescriptor(), s); + print_ConstantTensorDescriptor(s, TDesc::GetOriginalTensorDescriptor()); } diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 63e17b35ac..788d197351 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -2,25 +2,23 @@ #include "common.hip.hpp" template -__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_packed(Lengths) +__host__ __device__ constexpr auto calculate_tensor_strides_packed(Lengths) { return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), mod_conv::multiplies{}) .PushBack(Number<1>{}); } template -__host__ __device__ constexpr auto calculate_tensor_strides_default_rank_aligned(Lengths, - Number) +__host__ __device__ constexpr auto calculate_tensor_strides_aligned(Lengths, Number) { constexpr index_t L_back_align = Align * mod_conv::integer_divide_ceiler{}(Lengths{}.Back(), Align); - return calculate_tensor_strides_default_rank_packed( + return calculate_tensor_strides_packed( Lengths{}.Modify(Number{}, Number{})); } -// MemoryRanks of dimensions is for conversion from offset to multi-index -template +template struct ConstantTensorDescriptor { using Type = ConstantTensorDescriptor; @@ -29,15 +27,7 @@ struct ConstantTensorDescriptor __host__ __device__ constexpr ConstantTensorDescriptor() { - static_assert(Lengths::GetSize() == Strides::GetSize() && - Lengths::GetSize() == MemoryRanks::GetSize(), - "nDim not consistent"); - -#if 0 
// require sequence_sort, but it's not implemented yet - static_assert(is_same::SortedSeqType, - typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value, - "wrong! invalid MemoryRanks"); -#endif + static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); } __host__ __device__ static constexpr auto GetOriginalTensorDescriptor() { return Type{}; } @@ -54,8 +44,6 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto GetStrides() { return Strides{}; } - __host__ __device__ static constexpr auto GetMemoryRanks() { return MemoryRanks{}; } - template __host__ __device__ static constexpr index_t GetLength(Number) { @@ -68,12 +56,6 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } - template - __host__ __device__ static constexpr index_t GetMemoryRank(Number) - { - return MemoryRanks{}.Get(Number{}); - } - __host__ __device__ static constexpr bool AreStridesNonAscending() { bool flag = true; @@ -98,20 +80,13 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(Lengths{}, mod_conv::multiplies{}, Number<1>{}); } - // WRONG! ReorderGivenOld2New is broken template > __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) { -#if 0 - constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{}); - constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{}); - - constexpr index_t element_space_unaligned = accumulate_on_sequence( - (lengths_in_rank - Number<1>{}) * strides_in_rank, mod_conv::plus{}, Number<1>{}); -#else // WRONG! align shouldbe applied to the last memory rank, not the last tensor dimension + // This is WRONG! 
align should be applied to the last memory rank, not the last tensor + // dimension constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), mod_conv::plus{}, Number<1>{}); -#endif return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } @@ -148,35 +123,13 @@ struct ConstantTensorDescriptor multi_id * GetStrides(), mod_conv::plus{}, Number<0>{}); } -#if 0 // ReorderGivenOld2new is broken - __host__ __device__ static Array GetMultiIndexFromOffset(index_t offset) - { - Array ranked_multi_id; - - constexpr auto ranked_strides = - GetStrides().ReorderGivenOld2new(MemoryRanks{}); // check this - - // calculate index in each of the dimensions in the order of their rank (not dimension) - static_for<0, nDim - 1, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - constexpr index_t stride = ranked_strides.Get(Number{}); - ranked_multi_id[idim] = offset / stride; - offset -= ranked_multi_id[idim] * stride; - }); - - ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number{}); - - return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this - } -#endif - __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) { Array multi_id; - constexpr auto dummy_strides = calculate_tensor_strides_default_rank_packed(GetLengths()); + constexpr auto dummy_strides = calculate_tensor_strides_packed(GetLengths()); - // calculate index in each of the dimensions in the order of their dimension (not rank) + // calculate index in each of the dimensions in the order of their dimension static_for<0, nDim - 1, 1>{}([&](auto IDim) { constexpr index_t idim = IDim.Get(); constexpr index_t stride = dummy_strides.Get(Number{}); @@ -267,24 +220,16 @@ struct ConstantTensorDescriptor return new_multi_id; } - // WRONG! Ranks is broken template __host__ __device__ static constexpr auto Extract(Number...
extract_dims) { static_assert(sizeof...(IDims) <= GetNumOfDimension(), "wrong! too many number of dimensions to be extracted"); - using extract_lengths = decltype(Lengths{}.Extract(extract_dims...)); - using extract_strides = decltype(Strides{}.Extract(extract_dims...)); - using extract_ranks = decltype(MemoryRanks{}.Extract(extract_dims...)); + using extract_lengths = decltype(Lengths::Extract(extract_dims...)); + using extract_strides = decltype(Strides::Extract(extract_dims...)); -#if 0 - using new_ranks = typename sequence_sort::Original2SortedType; -#else // WRONG! TODO:: implement sequence_sort - using new_ranks = typename arithmetic_sequence_gen<0, sizeof...(IDims), 1>::SeqType; -#endif - - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } template @@ -298,12 +243,8 @@ struct ConstantTensorDescriptor { using leaf_tensor = ConstantTensorDescriptor; - // memory rank is broken - // TODO: remove memory rank info from tensor descritpor return ConstantTensorDescriptor{}; + decltype(GetStrides().Append(leaf_tensor::GetStrides()))>{}; } template @@ -311,18 +252,9 @@ struct ConstantTensorDescriptor { using slice_lengths = decltype(Lengths{}.Modify(Number{}, Number{})); - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } - template - struct f_fold_impl - { - __host__ __device__ constexpr index_t operator()(index_t x) const - { - return x > Threashold ? x + Delta : x; - } - }; - template __host__ __device__ static constexpr auto Fold(Number, Number...) 
{ @@ -333,7 +265,6 @@ struct ConstantTensorDescriptor constexpr auto unfold_length = GetLength(Number{}); constexpr auto unfold_stride = GetStride(Number{}); - constexpr auto unfold_rank = GetMemoryRank(Number{}); // length of the dimension to be folded needs to be dividable by fold_interval_product, // otherwise, folding is invalid @@ -350,16 +281,6 @@ struct ConstantTensorDescriptor reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), mod_conv::multiplies{}); - // folded_ranks - constexpr auto fold_ranks = - typename arithmetic_sequence_gen::SeqType{}; - - // increase the ranks that are larger than unfold_rank - constexpr auto tmp_ranks = transform_sequences( - f_fold_impl{}, GetMemoryRanks()); - // left and right constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{}; constexpr auto right = @@ -369,15 +290,8 @@ struct ConstantTensorDescriptor GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)); constexpr auto new_strides = GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right)); - constexpr auto new_ranks = - tmp_ranks.Extract(left).Append(fold_ranks).Append(tmp_ranks.Extract(right)); - static_assert(new_ranks.GetSize() == new_lengths.GetSize(), "wrong!"); - static_assert(fold_ranks.GetSize() == fold_lengths.GetSize(), "wrong!"); - - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } template @@ -411,11 +325,6 @@ struct ConstantTensorDescriptor // check if packed static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim), "wrong! dimensions to be unfolded need to be packed"); - - // check ranks - static_assert(GetMemoryRank(IDim_p1) == GetMemoryRank(IDim) + 1, - "wrong! 
ranks of dimensions to be unfolded need to be in increasing and " - "continuous ranks"); }); #endif @@ -426,21 +335,13 @@ struct ConstantTensorDescriptor constexpr auto right = typename arithmetic_sequence_gen::SeqType{}; - // unfolded length, stride and rank + // unfolded length, stride constexpr index_t unfold_length = accumulate_on_sequence( GetLengths().Extract(middle), mod_conv::multiplies{}, Number<1>{}); constexpr index_t unfold_stride = GetStride(Number{}); - constexpr index_t unfold_rank = GetMemoryRank(Number{}); - - // decrease the ranks that are larger than the rank of LastUnfoldDim - constexpr auto tmp_ranks = - transform_sequences(f_unfold_impl{}), - LastUnfoldDim - FirstUnfoldDim + 1>{}, - GetMemoryRanks()); - - // new lengths, strides and ranks + // new lengths, strides constexpr auto new_lengths = GetLengths() .Extract(left) .PushBack(Number{}) @@ -451,22 +352,14 @@ struct ConstantTensorDescriptor .PushBack(Number{}) .Append(GetStrides().Extract(right)); - constexpr auto new_ranks = tmp_ranks.Extract(left) - .PushBack(Number{}) - .Append(tmp_ranks.Extract(right)); - - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } template __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) { return ConstantTensorDescriptor{}; + decltype(Strides{}.ReorderGivenNew2Old(MapNew2Old{}))>{}; } #if 0 // require sequence_sort, which is not implemented yet @@ -474,358 +367,108 @@ struct ConstantTensorDescriptor __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) { return ConstantTensorDescriptor{}; + decltype(Strides{}.ReorderGivenOld2New(MapOld2New{}))>{} } #endif }; template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_packed(Lengths) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor_packed(Lengths) { - using Strides = decltype(calculate_tensor_strides_default_rank_packed(Lengths{})); - using MemoryRanks = typename arithmetic_sequence_gen<0, 
Lengths::GetSize(), 1>::SeqType; - return ConstantTensorDescriptor{}; + using Strides = decltype(calculate_tensor_strides_packed(Lengths{})); + return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank(Lengths, Strides) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) { - using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; - return ConstantTensorDescriptor{}; + return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor_default_rank_aligned(Lengths, - Number) +__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number) { - using Strides = - decltype(calculate_tensor_strides_default_rank_aligned(Lengths{}, Number{})); - using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; - return ConstantTensorDescriptor{}; + using Strides = decltype(calculate_tensor_strides_aligned(Lengths{}, Number{})); + return ConstantTensorDescriptor{}; } -template -__host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s) +template +__host__ __device__ void +print_ConstantTensorDescriptor(const char* s, + ConstantTensorDescriptor, Sequence>) { - constexpr index_t ndim = TDesc::GetNumOfDimension(); + constexpr index_t ndim = sizeof...(Lengths); - static_assert(ndim >= 1 && ndim <= 10, "wrong!"); + static_assert(ndim > 0 && ndim <= 10, "wrong!"); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u}, strides {%u}, ranks {%u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetStride(I0), - desc.GetMemoryRank(I0)); + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u}, strides {%u}\n", s, ndim, Lengths..., Strides...); }); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = 
Number<1>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u}, strides {%u %u}, ranks {%u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1)); + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n", s, ndim, Lengths..., Strides...); }); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u}, strides {%u %u %u}, ranks {%u %u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2)); + static_if{}([&](auto) { + printf( + "%s dim %u, lengths {%u %u %u}, strides {%u %u %u}\n", s, ndim, Lengths..., Strides...); }); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}, ranks {%u %u %u %u}\n", + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n", s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3)); + ndim, + Lengths..., + Strides...); }); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 
= Number<4>{}; + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); - constexpr auto desc = fwd(TDesc{}); + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); - printf("%s dim %u, lengths {%u %u %u %u %u}, strides {%u %u %u %u %u}, ranks {%u %u %u %u " + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}\n", + s, + ndim, + Lengths..., + Strides...); + }); + + static_if{}([&](auto) { + printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " "%u}\n", s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetStride(I4), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4)); + ndim, + Lengths..., + Strides...); }); - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u %u %u %u}, strides {%u %u %u %u %u %u}, ranks {%u %u " - "%u %u %u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetLength(I5), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - 
desc.GetStride(I4), - desc.GetStride(I5), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4), - desc.GetMemoryRank(I5)); - }); - - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u}, ranks " - "{%u %u %u %u %u %u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetLength(I5), - desc.GetLength(I6), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetStride(I4), - desc.GetStride(I5), - desc.GetStride(I6), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4), - desc.GetMemoryRank(I5), - desc.GetMemoryRank(I6)); - }); - - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u}, " - "ranks {%u %u %u %u %u %u %u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetLength(I5), - desc.GetLength(I6), - desc.GetLength(I7), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetStride(I4), - 
desc.GetStride(I5), - desc.GetStride(I6), - desc.GetStride(I7), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4), - desc.GetMemoryRank(I5), - desc.GetMemoryRank(I6), - desc.GetMemoryRank(I7)); - }); - - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; - - constexpr auto desc = fwd(TDesc{}); - - printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u}, strides {%u %u %u %u %u %u %u %u " - "%u}, ranks {%u %u %u %u %u %u %u %u %u}\n", - s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetLength(I5), - desc.GetLength(I6), - desc.GetLength(I7), - desc.GetLength(I8), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetStride(I4), - desc.GetStride(I5), - desc.GetStride(I6), - desc.GetStride(I7), - desc.GetStride(I8), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4), - desc.GetMemoryRank(I5), - desc.GetMemoryRank(I6), - desc.GetMemoryRank(I7), - desc.GetMemoryRank(I8)); - }); - - static_if{}([&](auto fwd) { - constexpr auto I0 = Number<0>{}; - constexpr auto I1 = Number<1>{}; - constexpr auto I2 = Number<2>{}; - constexpr auto I3 = Number<3>{}; - constexpr auto I4 = Number<4>{}; - constexpr auto I5 = Number<5>{}; - constexpr auto I6 = Number<6>{}; - constexpr auto I7 = Number<7>{}; - constexpr auto I8 = Number<8>{}; - constexpr auto I9 = Number<9>{}; - - constexpr auto desc = fwd(TDesc{}); - + static_if{}([&](auto) { printf("%s dim %u, lengths {%u %u %u %u %u %u %u %u %u %u}, strides 
{%u %u %u %u %u %u %u " - "%u %u %u}, ranks {%u %u %u %u %u %u %u %u %u %u}\n", + "%u %u %u}\n", s, - desc.GetNumOfDimension(), - desc.GetLength(I0), - desc.GetLength(I1), - desc.GetLength(I2), - desc.GetLength(I3), - desc.GetLength(I4), - desc.GetLength(I5), - desc.GetLength(I6), - desc.GetLength(I7), - desc.GetLength(I8), - desc.GetLength(I9), - desc.GetStride(I0), - desc.GetStride(I1), - desc.GetStride(I2), - desc.GetStride(I3), - desc.GetStride(I4), - desc.GetStride(I5), - desc.GetStride(I6), - desc.GetStride(I7), - desc.GetStride(I8), - desc.GetStride(I9), - desc.GetMemoryRank(I0), - desc.GetMemoryRank(I1), - desc.GetMemoryRank(I2), - desc.GetMemoryRank(I3), - desc.GetMemoryRank(I4), - desc.GetMemoryRank(I5), - desc.GetMemoryRank(I6), - desc.GetMemoryRank(I7), - desc.GetMemoryRank(I8), - desc.GetMemoryRank(I9)); + ndim, + Lengths..., + Strides...); }); } diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index fdb32e8f05..4d1262226d 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst constexpr auto dst_desc = DstDesc{}; - constexpr auto desc = make_ConstantTensorDescriptor_default_rank_packed(dst_desc.GetLengths()); + constexpr auto desc = make_ConstantTensorDescriptor_packed(dst_desc.GetLengths()); #if 0 if(get_thread_local_1d_id() == 0) @@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -259,7 +259,7 @@ struct Blockwise4dTensorCopy1 constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, 
DataPerRead); constexpr auto ref_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -336,7 +336,7 @@ struct BlockwiseChwnTensorCopyPadded constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(DstOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(DstOpLengths{}); constexpr auto h_global_pad_low = GlobalLowerPads{}.Get(I0); constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1); @@ -510,8 +510,7 @@ struct Blockwise4dTensorCopy3 } } - constexpr auto thread_cluster_desc = - make_ConstantTensorDescriptor_default_rank_packed(ThreadPerDims{}); + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(ThreadPerDims{}); const auto thread_multi_id = thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); @@ -653,7 +652,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed( Sequence{}); #pragma unroll @@ -720,7 +719,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto clipboard_desc = make_ConstantTensorDescriptor_packed( Sequence{}); #pragma unroll diff --git a/src/include/blockwise_generic_tensor_slice_op.hip.hpp b/src/include/blockwise_generic_tensor_slice_op.hip.hpp index d2de349d74..3f0bf5690f 100644 --- a/src/include/blockwise_generic_tensor_slice_op.hip.hpp +++ 
b/src/include/blockwise_generic_tensor_slice_op.hip.hpp @@ -63,7 +63,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 "wrong!"); // thread cluster - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed( DataClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); // BlockSize @@ -185,7 +185,7 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); constexpr auto thread_tensor_desc = - make_ConstantTensorDescriptor_default_rank_packed(SubLengths{} * repeat_lengths); + make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths); return thread_tensor_desc.GetElementSpace(); } @@ -199,8 +199,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( - thread_sub_tensor_lengths * repeat_lengths); + constexpr auto thread_tensor_desc = + make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); @@ -237,8 +237,8 @@ struct BlockwiseGenericTensorSliceCopy_v1 constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{}); - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor_default_rank_packed( - thread_sub_tensor_lengths * repeat_lengths); + constexpr auto thread_tensor_desc = + make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = sequence2array(decltype(repeat_multi_id_){}); diff --git a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp index 2e6569e652..007b371cea 100644 --- 
a/src/include/blockwise_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_tensor_slice_op.hip.hpp @@ -40,7 +40,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); constexpr auto thread_cluster_desc = - make_ConstantTensorDescriptor_default_rank_packed(thread_cluster_lengths); + make_ConstantTensorDescriptor_packed(thread_cluster_lengths); // sanity check: data type static_assert(is_same::value, "wrong! only support float for now!\n"); @@ -175,7 +175,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); + make_ConstantTensorDescriptor_packed(thread_tensor_lengths); return thread_tensor_desc.GetElementSpace(); } @@ -196,7 +196,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); + make_ConstantTensorDescriptor_packed(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; @@ -234,7 +234,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; constexpr auto thread_tensor_desc = - make_ConstantTensorDescriptor_default_rank_packed(thread_tensor_lengths); + make_ConstantTensorDescriptor_packed(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; diff --git a/src/include/conv_common.hip.hpp b/src/include/conv_common.hip.hpp index 2eb5a83d30..6fe7104be3 100644 --- a/src/include/conv_common.hip.hpp +++ b/src/include/conv_common.hip.hpp @@ -30,7 +30,7 @@ __host__ __device__ constexpr auto 
get_convolution_output_default_4d_tensor_desc constexpr auto HO = HI + 1 - Y; constexpr auto WO = WI + 1 - X; - return make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + return make_ConstantTensorDescriptor_packed(Sequence{}); } template @@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y; constexpr auto WO = WI + WPadLow + WPadUp + 1 - X; - return make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + return make_ConstantTensorDescriptor_packed(Sequence{}); } template diff --git a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp index e2b009da06..04ec8f4c62 100644 --- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp @@ -45,23 +45,23 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw constexpr index_t Y = wei_kcyx_global_desc.GetLength(I2); constexpr index_t X = wei_kcyx_global_desc.GetLength(I3); - constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto wei_ke_global_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // 2d view of wei for blockwise copy constexpr index_t HiPerBlock = HoPerBlock + Y - 1; constexpr index_t WiPerBlock = WoPerBlock + X - 1; - constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_nchw_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); - constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_ke_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // 2d view of wei for blockwise copy - constexpr auto wei_kcyx_block_desc = make_ConstantTensorDescriptor_default_rank( - Sequence{}, - Sequence{}); + constexpr auto wei_kcyx_block_desc = + 
make_ConstantTensorDescriptor(Sequence{}, + Sequence{}); // shared mem constexpr index_t in_block_element_size = @@ -82,11 +82,11 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw constexpr index_t HiPerThread = HoPerThread + Y - 1; constexpr index_t WiPerThread = WoPerThread + X - 1; - constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor_default_rank( + constexpr auto in_nchw_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, in_nchw_block_desc.GetStrides()); - constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor_default_rank( + constexpr auto wei_kcyx_thread_block_desc = make_ConstantTensorDescriptor( Sequence{}, wei_kcyx_block_desc.GetStrides()); constexpr auto out_nkhw_thread_desc = get_convolution_output_default_4d_tensor_descriptor( diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index 17a2bc1fdb..ca0ac22944 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -85,7 +85,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); const auto block_work_multi_id = @@ -109,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_c_h_w_n_block_desc = 
make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -118,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // blockwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index ddccdd8da6..e1d23053f1 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -86,7 +86,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); const auto block_work_multi_id = @@ -102,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // global tensor view constexpr auto wei_c_k_global_desc = - make_ConstantTensorDescriptor_default_rank(Sequence{}, Sequence{}); + make_ConstantTensorDescriptor(Sequence{}, Sequence{}); // LDS tensor view // be careful of alignment @@ -111,7 +111,7 @@ struct 
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -120,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // blockwise copy @@ -448,10 +448,10 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t K1 = KPerBlock / KPerThread; #if 0 - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor_packed( Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); #else constexpr auto out_10d_global_desc = diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 6c219dc0f5..0f9a11e218 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -86,7 +86,7 @@ struct 
GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); const auto block_work_multi_id = @@ -110,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -119,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // blockwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index 8fc1d1f17f..6d55d66cf2 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -83,7 +83,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = 
mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto block_work_desc = make_ConstantTensorDescriptor_packed( Sequence{}); const auto block_work_multi_id = @@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw // global tensor view constexpr auto wei_c_k_global_desc = - make_ConstantTensorDescriptor_default_rank(Sequence{}, Sequence{}); + make_ConstantTensorDescriptor(Sequence{}, Sequence{}); // LDS tensor view // be careful of alignment @@ -108,7 +108,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -117,12 +117,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_default_rank_packed( + constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); // blockwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 8076fa8d2a..ca1b9e8c2e 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -88,7 +88,7 @@ struct 
GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); @@ -111,9 +111,8 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw // memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_c_n1_b_n2_block_mem_desc = - make_ConstantTensorDescriptor_default_rank_aligned( - Sequence{}, Number{}); + constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); // this check is ad-hoc // TODO: need to properly implement tensor descriptor with alignment @@ -143,7 +142,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -367,7 +366,7 @@ struct GridwiseConvolutionImplicitGemm_v3_lds_double_buffer_nchw_cyxk_nkhw // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = - make_ConstantTensorDescriptor_default_rank_packed( + make_ConstantTensorDescriptor_packed( Sequence{}); // output tensor descriptor in register, src of threadwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index f680531d38..42dfa288e0 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ 
b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -91,7 +91,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); @@ -114,9 +114,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // memory layout descriptor in LDS [C, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_c_n1_b_n2_block_mem_desc = - make_ConstantTensorDescriptor_default_rank_aligned( - Sequence{}, Number{}); + constexpr auto in_c_n1_b_n2_block_mem_desc = make_ConstantTensorDescriptor_aligned( + Sequence{}, Number{}); // this check is ad-hoc // TODO: need to properly implement tensor descriptor with alignment @@ -146,7 +145,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -320,7 +319,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = - make_ConstantTensorDescriptor_default_rank_packed( + make_ConstantTensorDescriptor_packed( Sequence{}); // output tensor descriptor in register, src of threadwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp index 8a6648db04..6c7b77e46d 100644 --- 
a/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_lds_double_buffer_nchw_kcyx_nkhw.hip.hpp @@ -99,7 +99,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); @@ -127,20 +127,9 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw Sequence<3, 6, 7>{}, Sequence<5>{}); -#if 0 - if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0) - { - print_ConstantTensorDescriptor(in_n0_n1_n2_h_w_global_desc, - "in_n0_n1_n2_h_w_global_desc: "); - print_ConstantTensorDescriptor(in_c_y_x_global_desc, "in_c_y_x_global_desc: "); - print_ConstantMergedTensorDescriptor(in_e_n1_b_n2_global_merged_desc, - "in_e_n1_b_n2_global_merged_desc: "); - } -#endif - // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -174,7 +163,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -406,7 +395,7 @@ struct GridwiseConvolutionImplicitGemm_v4_lds_double_buffer_nchw_kcyx_nkhw // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto 
out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = - make_ConstantTensorDescriptor_default_rank_packed( + make_ConstantTensorDescriptor_packed( Sequence{}); // output tensor descriptor in register, src of threadwise copy diff --git a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp index c87f933123..0702204821 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hip.hpp @@ -93,7 +93,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw constexpr index_t BBlockWork = B / BPerBlock; constexpr auto block_work_desc = - make_ConstantTensorDescriptor_default_rank_packed(Sequence{}); + make_ConstantTensorDescriptor_packed(Sequence{}); const auto block_work_multi_id = block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); @@ -134,7 +134,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // memory layout descriptor in LDS [E, N1, B, N2], dst of blockwise copy // be careful of LDS alignment - constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto in_e_n1_b_n2_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); // this check is ad-hoc @@ -167,7 +167,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_default_rank_aligned( + constexpr auto wei_e_k_block_desc = make_ConstantTensorDescriptor_aligned( Sequence{}, Number{}); @@ -288,7 +288,7 @@ struct GridwiseConvolutionImplicitGemm_v4_nchw_kcyx_nkhw // define tensor descriptor for threadwise copy // output memory layout descriptor in register constexpr auto out_k0_k1_k2_n1_n0_h_w_n2_thread_mem_desc = - make_ConstantTensorDescriptor_default_rank_packed( + 
make_ConstantTensorDescriptor_packed( Sequence{}); // output tensor descriptor in register, src of threadwise copy diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index e6b2fdd8c2..3ba4a8dd4e 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -80,10 +80,8 @@ __device__ void threadwise_direct_convolution_2(InDesc, constexpr auto wei_desc = WeiDesc{}; constexpr auto out_desc = OutDesc{}; - constexpr auto in_reg_desc = - make_ConstantTensorDescriptor_default_rank_packed(in_desc.GetLengths()); - constexpr auto wei_reg_desc = - make_ConstantTensorDescriptor_default_rank_packed(wei_desc.GetLengths()); + constexpr auto in_reg_desc = make_ConstantTensorDescriptor_packed(in_desc.GetLengths()); + constexpr auto wei_reg_desc = make_ConstantTensorDescriptor_packed(wei_desc.GetLengths()); // register TInWei p_in_reg[in_reg_desc.GetElementSpace()]; diff --git a/src/include/threadwise_tensor_slice_op.hip.hpp b/src/include/threadwise_tensor_slice_op.hip.hpp index 1ff6afe68d..3d69810316 100644 --- a/src/include/threadwise_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_tensor_slice_op.hip.hpp @@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor_default_rank_packed(SrcOpLengths{}); + constexpr auto ref_desc = make_ConstantTensorDescriptor_packed(SrcOpLengths{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)