diff --git a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 4e739a001e..9a4e28b410 100644 --- a/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -38,7 +38,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // reorder input - auto in_chwn_desc = make_ConstantTensorDescriptor(Sequence{}); + auto in_chwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(in_chwn_desc, std::cout << "in_chwn_desc: "); Tensor in_chwn(make_TensorDescriptor(in_chwn_desc)); @@ -64,7 +64,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence{}); + auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp index 5ea9b8f030..80c15a1a50 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_khwn.hpp @@ -37,7 +37,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // 
reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); @@ -50,7 +50,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_khwn(InDesc, std::thread::hardware_concurrency()); // output - auto out_khwn_desc = make_ConstantTensorDescriptor(Sequence{}); + auto out_khwn_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(out_khwn_desc, std::cout << "out_khwn_desc: "); Tensor out_khwn(make_TensorDescriptor(out_khwn_desc)); diff --git a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp index 34ed48229f..6a005c5dd6 100644 --- a/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp @@ -36,7 +36,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc, constexpr index_t X = wei_kcyx_desc.GetLength(I3); // reorder weight - auto wei_cyxk_desc = make_ConstantTensorDescriptor(Sequence{}); + auto wei_cyxk_desc = make_packed_ConstantTensorDescriptor(Sequence{}); ostream_ConstantTensorDescriptor(wei_cyxk_desc, std::cout << "wei_cyxk_desc: "); Tensor wei_cyxk(make_TensorDescriptor(wei_cyxk_desc)); diff --git a/driver/driver.hip.cpp b/driver/driver.hip.cpp index ef0db9e5b7..3adab07387 100644 --- a/driver/driver.hip.cpp +++ b/driver/driver.hip.cpp @@ -548,8 +548,8 @@ int main(int argc, char* argv[]) auto lower_pads = Sequence{}; auto upper_pads = Sequence{}; - auto in_nchw_desc = make_ConstantTensorDescriptor(Sequence{}); - auto wei_kcyx_desc = make_ConstantTensorDescriptor(Sequence{}); + auto in_nchw_desc = make_packed_ConstantTensorDescriptor(Sequence{}); + auto wei_kcyx_desc = make_packed_ConstantTensorDescriptor(Sequence{}); auto out_nkhw_desc = 
get_convolution_with_padding_output_default_4d_tensor_descriptor( in_nchw_desc, wei_kcyx_desc, lower_pads, upper_pads); diff --git a/src/include/Array.hip.hpp b/src/include/Array.hip.hpp index 30e3bd0b7c..103639e7e8 100644 --- a/src/include/Array.hip.hpp +++ b/src/include/Array.hip.hpp @@ -16,6 +16,8 @@ struct Array { } + __host__ __device__ constexpr index_t GetSize() const { return NSize; } + __host__ __device__ const TData& operator[](index_t i) const { return mData[i]; } __host__ __device__ TData& operator[](index_t i) { return mData[i]; } @@ -67,6 +69,23 @@ __host__ __device__ auto reorder_array_given_old2new(const Array& return new_array; } +template +__host__ __device__ auto extract_array(const Array& old_array, ExtractSeq) +{ + Array new_array; + + constexpr index_t new_size = ExtractSeq::GetSize(); + + static_assert(new_size <= NSize, "wrong! too many extract"); + + static_for<0, new_size, 1>{}([&](auto I) { + constexpr index_t i = I.Get(); + new_array[i] = old_array[ExtractSeq{}.Get(I)]; + }); + + return new_array; +} + template __host__ __device__ constexpr auto operator+(const Array& a, const Array& b) diff --git a/src/include/ConstantMatrixDescriptor.hip.hpp b/src/include/ConstantMatrixDescriptor.hip.hpp index c6ca3192e3..9cb3050382 100644 --- a/src/include/ConstantMatrixDescriptor.hip.hpp +++ b/src/include/ConstantMatrixDescriptor.hip.hpp @@ -21,7 +21,7 @@ struct ConstantMatrixDescriptor __host__ __device__ constexpr index_t GetElementSpace() const { return NRow_ * RowStride_; } - __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const + __host__ __device__ index_t GetOffsetFromMultiIndex(index_t irow, index_t icol) const { return irow * RowStride_ + icol; } diff --git a/src/include/ConstantMergedTensorDescriptor.hip.hpp b/src/include/ConstantMergedTensorDescriptor.hip.hpp index 3c31da0c3c..5daecd7105 100644 --- a/src/include/ConstantMergedTensorDescriptor.hip.hpp +++ b/src/include/ConstantMergedTensorDescriptor.hip.hpp @@ -2,94 
+2,118 @@ #include "common.hip.hpp" #include "ConstantTensorDescriptor.hip.hpp" -// TensorDesc: ConstantTensorDescriptor<...> -// MergedDimRanges: Sequence -template +// OriginalTensorDesc : ConstantTensorDescriptor<...> +// it's the tensor whose dimensions are to be merged +// OriginalDimMergeSeqs : Sequence<...>... +// each is a sequence of original dimensions (of OriginalTensorDesc) to be merged +template struct ConstantMergedTensorDescriptor { - static constexpr index_t nOriginalDim = GetNumOfOriginalDimension(); - static constexpr index_t nDim = GetNumOfDimension(); + static constexpr auto mOriginalDimMergeSeqs = std::tuple{}; + + static constexpr index_t nDim = std::tuple_size::value; + static constexpr index_t nOriginalDim = OriginalDesc::GetNumOfDimension(); - template __host__ __device__ constexpr ConstantMergedTensorDescriptor() { - constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges{}...); + static_assert(nDim <= nOriginalDim, "wrong!"); - static_for<0, sizeof...(MergedDimRanges), 1>{}([&](auto I) { - constexpr index_t i = I.Get(); - constexpr auto merged_dim_range = std::get(merged_dim_ranges); + // TODO: check each of OriginalDimMergeSeqs contains at least 1, and at most + // OriginalTensorDesc::nDim number of dimensions - static_assert(merged_dim_range.GetSize() == 2, - "wrong! 
should specify first and last dimension to be merged"); - static_assert(merged_dim_range.Get(Number<0>{}) < GetNumOfUnmergedDimension(), - "wrong!"); - static_assert(merged_dim_range.Get(Number<1>{}) < GetNumOfUnmergedDimension(), - "wrong!"); - static_assert(merged_dim_range.Get(Number<0>{}) <= merged_dim_range.Get(Number<1>{}), - "wrong!"); - }); + // TODO: check there is no duplication in OriginalDimMergeSeqs + + // TODO: check OriginalDimMergeSeqs contains all original dimensions } - __host__ __device__ static constexpr index_t GetNumOfDimension() - { - constexpr auto merged_dim_ranges = std::make_tuple(MergedDimRanges...); + __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } - struct f_calculate_num_of_lost_dim - { - __host__ __device__ constexpr index_t operator()(auto I) const + __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() { return nOriginalDim } + + template + __host__ __device__ static constexpr bool ContainMultipleOriginalDimensions(Number) + { + return (std::Get(mOriginalDimMergeSeqs).GetSize() > 1); + } + + template + __host__ __device__ static constexpr index_t GetLength(Number) + { + constexpr auto original_dims_partial = std::Get(mOriginalDimMergeSeqs); + + return OriginalTensorDesc::Extract(original_dims_partial).GetElementSize(); + } + + template + __host__ __device__ static constexpr index_t GetStride(Number) + { + static_assert(!ContainMultipleOriginalDimensions(Number{}), + "wrong! 
stride of a merged dimension is undefined"); + + constexpr auto idim_original = std::Get(mOriginalDimMergeSeqs).Front(); + + return OriginalTensorDesc::GetStride(Number{}); + } + + __host__ __device__ static constexpr auto GetLengths() + { + return Sequence{}; + } + + __host__ __device__ static constexpr index_t GetElementSize() + { + return OriginalTensorDesc::GetElementSize(); + } + + __host__ __device__ static auto + GetOriginalMultiIndexFromMultiIndex(Array multi_id) + { + Array original_multi_id; + + static_for<0, nDim, 1>{}([&](auto IDim) { + constexpr index_t idim = IDim.Get(); + constexpr auto original_dims_partial = std::get(mOriginalDimMergeSeqs); + + // get partial original-multi-id corresponding to this merged dimension + constexpr auto original_multi_id_partial = + OriginalTensorDesc::Extract(original_dims_partial) + .GetMultiIndexFrom1dIndex(multi_id[idim]); + + // make sure compiler unroll this loop and propagate all the constants + for(index_t i = 0; i < original_dims_partial.GetSize(); ++i) { - constexpr index_t i = I.Get(); - constexpr auto merged_dim_range = std::get(merged_dim_ranges); + index_t idim_original = original_dims_partial[i]; - return merged_dim_range.Get(Number<1>{}) - merged_dim_range.Get(Number<0>{}); + original_multi_id[idim_original] = original_multi_id_partial[i] } - }; + }); - constexpr index_t num_lost_dim = static_const_reduce_n{}( - f_calculate_num_of_lost_dim, std::plus{}); - - return TensorDesc::GetNumOfDimension() - num_lost_dim; + return original_multi_id; } - __host__ __device__ static constexpr index_t GetNumOfOriginalDimension() + __host__ __device__ static index_t GetOffsetFromMultiIndex(Array multi_id) { - return TensorDesc::GetNumOfDimension(); + const auto original_multi_id = GetOriginalMultiIndexFromMultiIndex(multi_id); + + return OriginalTensorDesc::GetOffsetFromMultiIndex(orginal_multi_id); } - template - __host__ __device__ static constexpr bool IsMergedDimension(Number) + template + __host__ __device__ 
static index_t GetOffsetFromMultiIndex(Is... is) { - // not implemented + return GetOffsetFromMultiIndex(Array{is...}); } - template - __host__ __device__ static constexpr bool GetLength(Number) + __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) { - // not implemented - } + constexpr auto dummy_desc = make_packed_ConstantTensorDescriptor(GetLengths()); - template - __host__ __device__ static constexpr bool GetStride(Number) - { - static_assert(!IsMergedDimension(Number{}, "wrong! stride of a merged dimension is undefined") - // not implemented - } - - template - __host__ __device__ auto MultiIndex2OriginalMultiIndex(Is... is) const - { - // not implemented - } - - template - __host__ __device__ auto OriginalMultiIndex2MultiIndex(Is... is) const - { - // not implemented + return dummy_desc.GetMultiIndexFrom1dIndex(id); } }; -template -constexpr auto make_ConstantMergedTensorDescriptor(TensorDesc, MergedDimRanges...) +template +constexpr auto make_ConstantMergedTensorDescriptor(OriginalTensorDesc, OriginalDimMergeSeqs...) 
{ - return ConstantMergedTensorDescriptor{}; + return ConstantMergedTensorDescriptor{}; } diff --git a/src/include/ConstantTensorDescriptor.hip.hpp b/src/include/ConstantTensorDescriptor.hip.hpp index 45c779bd25..880ea5038f 100644 --- a/src/include/ConstantTensorDescriptor.hip.hpp +++ b/src/include/ConstantTensorDescriptor.hip.hpp @@ -2,40 +2,25 @@ #include "common.hip.hpp" template -__host__ __device__ constexpr auto calculate_default_strides(Lengths) +__host__ __device__ constexpr auto calculate_packed_tensor_strides(Lengths) { - return reverse_inclusive_scan_sequence(Lengths{}.PopFront().PushBack(Number<1>{}), - std::multiplies{}); + return reverse_inclusive_scan_sequence(Lengths{}.PopFront(), std::multiplies{}) + .PushBack(Number<1>{}); } -// this is ugly, only for 2d -template -__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence, - Number) +template +__host__ __device__ constexpr auto + calculate_rank_tensor_default_strides_with_alignment(Lengths, Number) { - constexpr index_t L1_align = Align * ((L1 + Align - 1) / Align); - return Sequence{}; + constexpr index_t L_back_align = + Align * mod_conv::integer_divide_ceiler{}(Lengths{}.Back(), Align); + + return calculate_packed_tensor_strides( + Lengths{}.Modify(Number{}, Number{})); } -// this is ugly, only for 3d -template -__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence, - Number) -{ - constexpr index_t L2_align = Align * ((L2 + Align - 1) / Align); - return Sequence{}; -} - -// this is ugly, only for 4d -template -__host__ __device__ constexpr auto calculate_default_strides_aligned(Sequence, - Number) -{ - constexpr index_t L3_align = Align * ((L3 + Align - 1) / Align); - return Sequence{}; -} - -template +// MemoryRanks of dimensions is for conversion from offset to multi-index +template struct ConstantTensorDescriptor { using Type = ConstantTensorDescriptor; @@ -44,14 +29,24 @@ struct ConstantTensorDescriptor __host__ __device__ constexpr 
ConstantTensorDescriptor() { - static_assert(Lengths::GetSize() == Strides::GetSize(), "nDim not consistent"); + static_assert(Lengths::GetSize() == Strides::GetSize() && + Lengths::GetSize() == MemoryRanks::GetSize(), + "nDim not consistent"); + +#if 0 // require sequence_sort, but it's not implemented yet + static_assert(is_same::SortedSeqType, + typename arithmetic_sequence_gen<0, nDim, 1>::SeqType>::value, + "wrong! invalid MemoryRanks"); +#endif } __host__ __device__ static constexpr index_t GetNumOfDimension() { return nDim; } - __host__ __device__ static constexpr Lengths GetLengths() { return Lengths{}; } + __host__ __device__ static constexpr auto GetLengths() { return Lengths{}; } - __host__ __device__ static constexpr Strides GetStrides() { return Strides{}; } + __host__ __device__ static constexpr auto GetStrides() { return Strides{}; } + + __host__ __device__ static constexpr auto GetMemoryRanks() { return MemoryRanks{}; } template __host__ __device__ static constexpr index_t GetLength(Number) @@ -65,47 +60,58 @@ struct ConstantTensorDescriptor return Strides{}.Get(Number{}); } + template + __host__ __device__ static constexpr index_t GetMemoryRank(Number) + { + return MemoryRanks{}.Get(Number{}); + } + __host__ __device__ static constexpr index_t GetElementSize() { return accumulate_on_sequence(Lengths{}, std::multiplies{}, Number<1>{}); } + // WRONG! ReorderGivenOld2New is broken template > __host__ __device__ static constexpr index_t GetElementSpace(Align align = Align{}) { +#if 0 + constexpr auto lengths_in_rank = GetLengths().ReorderGivenOld2New(MemoryRank{}); + constexpr auto strides_in_rank = GetStrides().ReorderGivenOld2new(MemoryRank{}); + + constexpr index_t element_space_unaligned = accumulate_on_sequence( + (lengths_in_rank - Number<1>{}) * strides_in_rank, std::plus{}, Number<1>{}); +#else // WRONG! 
align shouldbe applied to the last memory rank, not the last tensor dimension constexpr index_t element_space_unaligned = accumulate_on_sequence( (GetLengths() - Number<1>{}) * GetStrides(), std::plus{}, Number<1>{}); +#endif return align.Get() * ((element_space_unaligned + align.Get() - 1) / align.Get()); } template - __host__ __device__ static index_t Get1dIndex(Array multi_id) + __host__ __device__ static index_t GetOffsetFromMultiIndex(Array multi_id) { static_assert(NSize == nDim, "wrong! Dimension not consistent"); - index_t id = 0; + index_t offset = 0; static_for<0, nDim, 1>{}([&](auto IDim) { constexpr index_t idim = IDim.Get(); - id += multi_id[idim] * GetStride(IDim); + offset += multi_id[idim] * GetStride(IDim); }); - return id; + return offset; } template - __host__ __device__ static index_t Get1dIndex(Is... is) + __host__ __device__ static index_t GetOffsetFromMultiIndex(Is... is) { - static_assert(sizeof...(Is) == nDim, "number of multi-index is wrong"); - - const auto multi_id = Array(is...); - - return Get1dIndex(multi_id); + return GetOffsetFromMultiIndex(Array{is...}); } template - __host__ __device__ static constexpr index_t Get1dIndex(Sequence /*multi_id*/) + __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(Sequence) { static_assert(sizeof...(Is) == nDim, "wrong! 
Dimension not consistent"); @@ -114,44 +120,84 @@ struct ConstantTensorDescriptor return accumulate_on_sequence(multi_id * GetStrides(), std::plus{}, Number<0>{}); } - __host__ __device__ static Array GetMultiIndex(index_t id) +#if 0 // ReorderGivenOld2new is broken + __host__ __device__ static Array GetMultiIndexFromOffset(index_t offset) + { + Array ranked_multi_id; + + constexpr auto ranked_strides = + GetStrides().ReorderGivenOld2new(MemoryRanks{}); // check this + + // calculate index in each of the dimensions in the order of their rank (not dimension) + static_for<0, nDim - 1, 1>{}([&](auto IDim) { + constexpr index_t idim = IDim.Get(); + constexpr index_t stride = ranked_strides.Get(Number{}); + ranked_multi_id[idim] = offset / stride; + offset -= ranked_multi_id[idim] * stride; + }); + + ranked_multi_id[nDim - 1] = offset / ranked_strides.Get(Number{}); + + return reorder_array_given_new2old(ranked_multi_id, MemoryRanks{}); // check this + } +#endif + + __host__ __device__ static Array GetMultiIndexFrom1dIndex(index_t id) { Array multi_id; + constexpr auto dummy_strides = calculate_packed_tensor_strides(GetLengths()); + + // calculate index in each of the dimensions in the order of their dimension (not rank) static_for<0, nDim - 1, 1>{}([&](auto IDim) { - constexpr index_t idim = IDim.Get(); - multi_id[idim] = id / GetStride(IDim); - id -= multi_id[idim] * GetStride(IDim); + constexpr index_t idim = IDim.Get(); + constexpr index_t stride = dummy_strides.Get(Number{}); + multi_id[idim] = id / stride; + id -= multi_id[idim] * stride; }); - multi_id[nDim - 1] = id / GetStride(Number{}); + multi_id[nDim - 1] = id / dummy_strides.Get(Number{}); return multi_id; } - __host__ __device__ static constexpr auto Pack() - { - constexpr auto default_strides = calculate_default_strides(Lengths{}); - return ConstantTensorDescriptor{}; - } - + // WRONG! Ranks is broken template __host__ __device__ static constexpr auto Extract(Number... 
extract_dims) { static_assert(sizeof...(IDims) <= GetNumOfDimension(), "wrong! too many number of dimensions to be extracted"); - return make_ConstantTensorDescriptor(Lengths{}.Extract(extract_dims...), - Strides{}.Extract(extract_dims...)); + using extract_lengths = decltype(Lengths{}.Extract(extract_dims...)); + using extract_strides = decltype(Strides{}.Extract(extract_dims...)); + using extract_ranks = decltype(MemoryRanks{}.Extract(extract_dims...)); + +#if 0 + using new_ranks = typename sequence_sort::Original2SortedType; +#else // WRONG! TODO:: implement sequence_sort + using new_ranks = typename arithmetic_sequence_gen<0, sizeof...(IDims), 1>::SeqType; +#endif + + return ConstantTensorDescriptor{}; } template __host__ __device__ static constexpr auto Slice(Number, Number) { - return make_ConstantTensorDescriptor(Lengths{}.Modify(Number{}, Number{}), - Strides{}); + using slice_lengths = decltype(Lengths{}.Modify(Number{}, Number{})); + + return ConstantTensorDescriptor{}; } + template + struct f_fold_impl + { + __host__ __device__ constexpr index_t operator()(index_t x) const + { + return x > Threashold ? x + Delta : x; + } + }; + template __host__ __device__ static constexpr auto Fold(Number, Number...) 
{ @@ -162,6 +208,7 @@ struct ConstantTensorDescriptor constexpr auto unfold_length = GetLength(Number{}); constexpr auto unfold_stride = GetStride(Number{}); + constexpr auto unfold_rank = GetMemoryRank(Number{}); // length of the dimension to be folded needs to be dividable by fold_interval_product, // otherwise, folding is invalid @@ -178,16 +225,45 @@ struct ConstantTensorDescriptor reverse_inclusive_scan_sequence(fold_intervals.PushBack(Number<1>{}), std::multiplies{}); - // left and right - constexpr auto left = make_increasing_sequence(Number<0>{}, Number{}, Number<1>{}); - constexpr auto right = make_increasing_sequence( - Number{}, Number{}, Number<1>{}); + // folded_ranks + constexpr auto fold_ranks = + typename arithmetic_sequence_gen::SeqType{}; - return make_ConstantTensorDescriptor( - GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)), - GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right))); + // increase the ranks that are larger than unfold_rank + constexpr auto tmp_ranks = transform_sequences( + f_fold_impl{}, GetMemoryRanks()); + + // left and right + constexpr auto left = typename arithmetic_sequence_gen<0, IDim, 1>::SeqType{}; + constexpr auto right = + typename arithmetic_sequence_gen::SeqType{}; + + constexpr auto new_lengths = + GetLengths().Extract(left).Append(fold_lengths).Append(GetLengths().Extract(right)); + constexpr auto new_strides = + GetStrides().Extract(left).Append(fold_strides).Append(GetStrides().Extract(right)); + constexpr auto new_ranks = + tmp_ranks.Extract(left).Append(fold_ranks).Append(tmp_ranks.Extract(right)); + + static_assert(new_ranks.GetSize() == new_lengths.GetSize(), "wrong!"); + static_assert(fold_ranks.GetSize() == fold_lengths.GetSize(), "wrong!"); + + return ConstantTensorDescriptor{}; } + template + struct f_unfold_impl + { + __host__ __device__ constexpr index_t operator()(index_t x) const + { + return x > Threashold ? 
x - Delta : x; + } + }; + template __host__ __device__ static constexpr auto Unfold(Number, Number) { @@ -198,66 +274,109 @@ struct ConstantTensorDescriptor // dimensions to be unfold need to be in descending order (w.r.t. strides), and need to be // packed in memory, otherwise, unfolding is invalid static_for{}([&](auto IDim) { + constexpr auto IDim_p1 = IDim + Number<1>{}; + + // check stride static_assert( - GetStride(IDim) >= GetStride(Number{}), + GetStride(IDim) >= GetStride(IDim_p1), "wrong! dimensions to be unfolded need to be in descending order w.r.t strides"); - static_assert(GetStride(IDim + 1) * GetLength(IDim + 1) == GetStride(IDim), + // check if packed + static_assert(GetStride(IDim_p1) * GetLength(IDim_p1) == GetStride(IDim), "wrong! dimensions to be unfolded need to be packed"); + + // checkt ranks + static_assert(GetMemoryRank(IDim_p1) = GetMemoryRank(IDim) + 1, + "wrong! ranks of dimensions to be " + "unfolded need to be in increasing " + "and continuous ranks"); }); // left and right - constexpr auto left = - make_increasing_sequence(Number<0>{}, Number{}, Number<1>{}); - constexpr auto middle = make_increasing_sequence( - Number{}, Number{}, Number<1>{}); - constexpr auto right = make_increasing_sequence( - Number{}, Number{}, Number<1>{}); + constexpr auto left = typename arithmetic_sequence_gen<0, FirstUnfoldDim, 1>::SeqType{}; + constexpr auto middle = + typename arithmetic_sequence_gen::SeqType{}; + constexpr auto right = + typename arithmetic_sequence_gen::SeqType{}; - // length and stride + // unfolded length, stride and rank constexpr index_t unfold_length = accumulate_on_sequence( GetLengths().Extract(middle), std::multiplies{}, Number<1>{}); constexpr index_t unfold_stride = GetStride(Number{}); - return make_ConstantTensorDescriptor(GetLengths() - .Extract(left) - .PushBack(Number{}) - .Append(GetLengths().Extract(right)), - GetStrides() - .Extract(left) - .PushBack(Number{}) - .Append(GetStrides().Extract(right))); + constexpr 
index_t unfold_rank = GetMemoryRank(Number{}); + + // decrease the ranks that are larger than the rank of LastUnfoldDim + constexpr auto tmp_ranks = + transform_sequences(GetMemoryRanks(), + f_unfold_impl{}), + LastUnfoldDim - FirstUnfoldDim + 1>{}); + + // new lengths, strides and ranks + constexpr auto new_lengths = GetLengths() + .Extract(left) + .PushBack(Number{}) + .Append(GetLengths().Extract(right)); + + constexpr auto new_strides = GetStrides() + .Extract(left) + .PushBack(Number{}) + .Append(GetStrides().Extract(right)); + + constexpr auto new_ranks = tmp_ranks.Extract(left) + .PushBack(Number{}) + .Append(tmp_ranks.Extract(right)); + + return ConstantTensorDescriptor{}; } - template - __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) + template + __host__ __device__ static constexpr auto ReorderGivenNew2Old(MapNew2Old) { - static_assert(sizeof...(IRs) == GetNumOfDimension(), "wrong! dimension is wrong"); - constexpr auto map_new2old = Sequence{}; - return make_ConstantTensorDescriptor(Lengths{}.ReorderGivenNew2Old(map_new2old), - Strides{}.ReorderGivenNew2Old(map_new2old)); + return ConstantTensorDescriptor{}; } + +#if 0 // require sequence_sort, which is not implemented yet + template + __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) + { + return ConstantTensorDescriptor{}; + } +#endif }; template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths) +__host__ __device__ constexpr auto make_packed_ConstantTensorDescriptor(Lengths) { - using Strides = decltype(calculate_default_strides(Lengths{})); - return ConstantTensorDescriptor{}; + using Strides = decltype(calculate_packed_tensor_strides(Lengths{})); + using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; + return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Strides) +__host__ __device__ constexpr auto 
make_ranked_ConstantTensorDescriptor(Lengths, Strides) { - return ConstantTensorDescriptor{}; + using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; + return ConstantTensorDescriptor{}; } template -__host__ __device__ constexpr auto make_ConstantTensorDescriptor_aligned(Lengths, Number) +__host__ __device__ constexpr auto + make_ranked_ConstantTensorDescriptor_with_alignment(Lengths, Number) { - using Strides = decltype(calculate_default_strides_aligned(Lengths{}, Number{})); - return ConstantTensorDescriptor{}; + using Strides = + decltype(calculate_rank_tensor_default_strides_with_alignment(Lengths{}, Number{})); + using MemoryRanks = typename arithmetic_sequence_gen<0, Lengths::GetSize(), 1>::SeqType; + return ConstantTensorDescriptor{}; } template diff --git a/src/include/Sequence.hip.hpp b/src/include/Sequence.hip.hpp index a7ab687b07..b5a3aacbd2 100644 --- a/src/include/Sequence.hip.hpp +++ b/src/include/Sequence.hip.hpp @@ -9,76 +9,100 @@ struct Sequence static constexpr index_t mSize = sizeof...(Is); - const index_t mData[mSize + 1] = { - Is..., 0}; // the last element is dummy, to prevent compiler complain on empty Sequence - __host__ __device__ static constexpr index_t GetSize() { return mSize; } template - __host__ __device__ constexpr index_t Get(Number) const + __host__ __device__ static constexpr index_t Get(Number) { + static_assert(I < mSize, "wrong! 
I too large"); + + // the last dummy element is to prevent compiler complain about empty Sequence + const index_t mData[mSize + 1] = {Is..., 0}; return mData[I]; } - __host__ __device__ index_t operator[](index_t i) const { return mData[i]; } - - template - __host__ __device__ constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) const + __host__ __device__ index_t operator[](index_t i) const { - static_assert(mSize == sizeof...(IRs), "mSize not consistent"); - - constexpr auto old = Type{}; - - return Sequence{})...>{}; + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[i]; } template - __host__ __device__ constexpr auto ReorderGivenOld2New(Sequence /*old2new*/) const + __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) { - // TODO: don't know how to implement this - printf("Sequence::ReorderGivenOld2New not implemented"); - assert(false); +#if 0 // require sequence_sort, which is not implemented yet + static_assert(is_same>::SortedSeqType, + arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value, + "wrong! invalid new2old map"); +#endif + + return Sequence{})...>{}; } - __host__ __device__ constexpr auto Reverse() const; +#if 0 // require sequence_sort, which is not implemented yet + template + __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New /*old2new*/) + { + static_assert(is_same::SortedSeqType, + arithmetic_sequence_gen<0, mSize, 1>::SeqType>::value, + "wrong! 
invalid old2new map"); - __host__ __device__ constexpr index_t Front() const { return mData[0]; } + constexpr auto map_new2old = typename sequence_map_inverse::SeqMapType{}; - __host__ __device__ constexpr index_t Back() const { return mData[mSize - 1]; } + return ReorderGivenNew2Old(map_new2old); + } +#endif + + __host__ __device__ static constexpr auto Reverse(); + + __host__ __device__ static constexpr index_t Front() + { + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[0]; + } + + __host__ __device__ static constexpr index_t Back() + { + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[mSize - 1]; + } template - __host__ __device__ constexpr auto PushFront(Number) const + __host__ __device__ static constexpr auto PushFront(Number) { return Sequence{}; } template - __host__ __device__ constexpr auto PushBack(Number) const + __host__ __device__ static constexpr auto PushBack(Number) { return Sequence{}; } - __host__ __device__ constexpr auto PopFront() const; + __host__ __device__ static constexpr auto PopFront(); - __host__ __device__ constexpr auto PopBack() const; + __host__ __device__ static constexpr auto PopBack(); template - __host__ __device__ constexpr auto Append(Sequence) const + __host__ __device__ static constexpr auto Append(Sequence) { return Sequence{}; } template - __host__ __device__ constexpr auto Extract(Number...) const + __host__ __device__ static constexpr auto Extract(Number...) 
{ return Sequence{})...>{}; } template - __host__ __device__ constexpr auto Extract(Sequence) const + __host__ __device__ static constexpr auto Extract(Sequence) { return Sequence{})...>{}; } + + template + __host__ __device__ static constexpr auto Modify(Number, Number); }; template @@ -91,43 +115,36 @@ struct sequence_merge, Sequence> }; template -struct increasing_sequence_gen_impl +struct arithmetic_sequence_gen_impl { static constexpr index_t NSizeLeft = NSize / 2; using SeqType = typename sequence_merge< - typename increasing_sequence_gen_impl::SeqType, - typename increasing_sequence_gen_impl::SeqType, + typename arithmetic_sequence_gen_impl::SeqType>::SeqType; }; template -struct increasing_sequence_gen_impl +struct arithmetic_sequence_gen_impl { using SeqType = Sequence; }; template -struct increasing_sequence_gen_impl +struct arithmetic_sequence_gen_impl { using SeqType = Sequence<>; }; template -struct increasing_sequence_gen +struct arithmetic_sequence_gen { using SeqType = - typename increasing_sequence_gen_impl::SeqType; + typename arithmetic_sequence_gen_impl::SeqType; }; -template -__host__ __device__ constexpr auto - make_increasing_sequence(Number, Number, Number) -{ - return typename increasing_sequence_gen::SeqType{}; -} - template struct sequence_reverse_inclusive_scan; @@ -161,8 +178,8 @@ struct sequence_split { static constexpr index_t NSize = Seq{}.GetSize(); - using range0 = typename increasing_sequence_gen<0, I, 1>::SeqType; - using range1 = typename increasing_sequence_gen::SeqType; + using range0 = typename arithmetic_sequence_gen<0, I, 1>::SeqType; + using range1 = typename arithmetic_sequence_gen::SeqType; using SeqType0 = typename sequence_extract::SeqType; using SeqType1 = typename sequence_extract::SeqType; @@ -191,6 +208,63 @@ struct sequence_reverse> using SeqType = Sequence; }; +#if 0 // not fully implemented +template +struct sequence_sort_merge_impl; + +template +struct sequence_sort_merge_impl, + Sequence, + Sequence, + 
Sequence> +{ +}; + +template +struct sequence_sort; + +template +struct sequence_sort> +{ + using OriginalSeqType = Sequence; + using SortedSeqType = xxxxx; + using MapSorted2OriginalType = xxx; +}; + +template +struct sequence_map_inverse_impl; + +// impl for valid map, no impl for invalid map +template +struct sequence_map_inverse_impl, true> +{ + using SeqMapType = sequence_sort>::MapSorted2OriginalType; +}; + +template +struct sequence_map_inverse; + +template +struct sequence_map_inverse> +{ + // TODO: make sure the map to be inversed is valid: [0, sizeof...(Is)) + static constexpr bool is_valid_sequence_map = + is_same>::SortedSeqType, + typename arithmetic_sequence_gen<0, sizeof...(Is), 1>::SeqType>::value; + + // make compiler fails, if is_valid_map != true + using SeqMapType = + typename sequence_map_inverse_impl, is_valid_map>::SeqMapType; +}; +#endif + template __host__ __device__ constexpr auto operator+(Sequence, Sequence) { @@ -243,7 +317,7 @@ __host__ __device__ constexpr auto operator+(Sequence, Number) template __host__ __device__ constexpr auto operator-(Sequence, Number) { -#if 0 // doesn't compile +#if 0 // TODO: turn it on. Doesn't compile constexpr auto seq_x = Sequence{}; static_for<0, sizeof...(Xs), 1>{}([&](auto Iter) { @@ -313,14 +387,13 @@ __host__ __device__ constexpr auto operator%(Number, Sequence) template __host__ __device__ constexpr auto sequence_pop_front(Sequence) { - static_assert(sizeof...(Is) > 0, "empty Sequence!"); return Sequence{}; } template __host__ __device__ constexpr auto sequence_pop_back(Seq) { - static_assert(Seq{}.GetSize() > 0, "empty Sequence!"); + static_assert(Seq{}.GetSize() > 0, "wrong! 
cannot pop an empty Sequence!"); return sequence_pop_front(Seq{}.Reverse()).Reverse(); } @@ -349,16 +422,16 @@ transform_sequences(F f, Sequence, Sequence, Sequence) return Sequence{}; } -template -__host__ __device__ constexpr auto Sequence::PopFront() const +template +__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce) { - return sequence_pop_front(Type{}); + return typename sequence_reverse_inclusive_scan::SeqType{}; } -template -__host__ __device__ constexpr auto Sequence::PopBack() const +template +__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce) { - return sequence_pop_back(Type{}); + return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse(); } template @@ -381,19 +454,32 @@ __host__ __device__ constexpr index_t } template -__host__ __device__ constexpr auto Sequence::Reverse() const +__host__ __device__ constexpr auto Sequence::PopFront() +{ + return sequence_pop_front(Type{}); +} + +template +__host__ __device__ constexpr auto Sequence::PopBack() +{ + return sequence_pop_back(Type{}); +} + +template +__host__ __device__ constexpr auto Sequence::Reverse() { return typename sequence_reverse>::SeqType{}; } -template -__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce) +template +template +__host__ __device__ constexpr auto Sequence::Modify(Number, Number) { - return typename sequence_reverse_inclusive_scan::SeqType{}; -} + static_assert(I < GetSize(), "wrong!"); -template -__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce) -{ - return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}).Reverse(); + using seq_split = sequence_split; + constexpr auto seq_left = typename seq_split::SeqType0{}; + constexpr auto seq_right = typename seq_split::SeqType1{}.PopFront(); + + return seq_left.PushBack(Number{}).Append(seq_right); } diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp index 
a5c8f1ea9b..1753a48e87 100644 --- a/src/include/blockwise_2d_tensor_op.hip.hpp +++ b/src/include/blockwise_2d_tensor_op.hip.hpp @@ -33,7 +33,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst const index_t did1 = is / desc.GetStride(I1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); f(p_dst[dindex]); } @@ -52,7 +52,7 @@ blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst const index_t did1 = is / desc.GetStride(I1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); f(p_dst[dindex]); } @@ -102,9 +102,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds did[1] = is / ref_desc.GetStride(I1); - const index_t aindex = src_desc.Get1dIndex(did[0], did[1]); + const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]); - const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]); + const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]); f(p_src[aindex], p_dst[bindex]); } @@ -125,9 +125,9 @@ __device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_ds did[1] = is / ref_desc.GetStride(I1); - const index_t aindex = src_desc.Get1dIndex(did[0], did[1]); + const index_t aindex = src_desc.GetOffsetFromMultiIndex(did[0], did[1]); - const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]); + const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]); f(p_src[aindex], p_dst[bindex]); } @@ -224,8 +224,10 @@ struct Blockwise2dTensorCopy1 did[1] = is / ref_desc.GetStride(I1); - const index_t src_index = src_desc.Get1dIndex(did[0], did[1] * DataPerRead); - const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1] * DataPerRead); + const index_t src_index = + src_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead); + const index_t dst_index = + 
dst_desc.GetOffsetFromMultiIndex(did[0], did[1] * DataPerRead); *(reinterpret_cast(p_dst + dst_index)) = *(reinterpret_cast(p_src + src_index)); @@ -328,8 +330,8 @@ struct Blockwise2dTensorCopy2 { index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); @@ -341,8 +343,8 @@ struct Blockwise2dTensorCopy2 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); @@ -354,8 +356,8 @@ struct Blockwise2dTensorCopy2 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 + d1v1loop * ThreadPerDim1 + mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); p_dst[dindex] = p_src[sindex]; } @@ -368,8 +370,8 @@ struct Blockwise2dTensorCopy2 if(did1 < L1) { - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); p_dst[dindex] = p_src[sindex]; } @@ -389,8 +391,8 @@ struct Blockwise2dTensorCopy2 { index_t did1 = d1v4loop * 4 * ThreadPerDim1 + 4 * 
mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); @@ -402,8 +404,8 @@ struct Blockwise2dTensorCopy2 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + d1v2loop * 2 * ThreadPerDim1 + 2 * mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); *(reinterpret_cast(p_dst + dindex)) = *(reinterpret_cast(p_src + sindex)); @@ -415,8 +417,8 @@ struct Blockwise2dTensorCopy2 index_t did1 = Dim1V4Loop * 4 * ThreadPerDim1 + Dim1V2Loop * 2 * ThreadPerDim1 + d1v1loop * ThreadPerDim1 + mThreadId1; - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); p_dst[dindex] = p_src[sindex]; } @@ -429,8 +431,8 @@ struct Blockwise2dTensorCopy2 if(did1 < L1) { - const index_t sindex = src_desc.Get1dIndex(did0, did1); - const index_t dindex = dst_desc.Get1dIndex(did0, did1); + const index_t sindex = src_desc.GetOffsetFromMultiIndex(did0, did1); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1); p_dst[dindex] = p_src[sindex]; } @@ -497,8 +499,10 @@ struct Blockwise2dTensorCopy3 const index_t thread_id_d0 = get_thread_local_1d_id() / thread_per_d1; const index_t thread_id_d1 = get_thread_local_1d_id() - thread_id_d0 * thread_per_d1; - mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_id_d0, thread_id_d1 * DataPerRead); - mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_id_d0, 
thread_id_d1 * DataPerRead); + mSrcMyThreadOffset = + SrcDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead); + mDstMyThreadOffset = + DstDesc{}.GetOffsetFromMultiIndex(thread_id_d0, thread_id_d1 * DataPerRead); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const diff --git a/src/include/blockwise_3d_tensor_op.hip.hpp b/src/include/blockwise_3d_tensor_op.hip.hpp index 3e9ae6920c..e7e7ee5592 100644 --- a/src/include/blockwise_3d_tensor_op.hip.hpp +++ b/src/include/blockwise_3d_tensor_op.hip.hpp @@ -71,8 +71,10 @@ struct Blockwise3dTensorCopy1 did[2] = is / ref_desc.GetStride(I2); - const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); - const index_t dst_index = dst_desc.Get1dIndex(did[0], did[1], did[2] * DataPerRead); + const index_t src_index = + src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead); + const index_t dst_index = + dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2] * DataPerRead); *(reinterpret_cast(p_dst + dst_index)) = *(reinterpret_cast(p_src + src_index)); @@ -167,12 +169,13 @@ struct Blockwise3dTensorCopy3 } constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{}); - const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); + const auto thread_multi_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - mSrcMyThreadOffset = SrcDesc{}.Get1dIndex( + mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex( thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead); - mDstMyThreadOffset = DstDesc{}.Get1dIndex( + mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex( thread_multi_id[0], thread_multi_id[1], thread_multi_id[2] * DataPerRead); } @@ -214,14 +217,14 @@ struct Blockwise3dTensorCopy3 for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) { const index_t src_offset = - SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * 
thread_per_d1, - iloop_d2 * thread_per_d2 * DataPerRead); + SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2 * DataPerRead); const index_t dst_offset = - DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2 * DataPerRead); + DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2 * DataPerRead); *(reinterpret_cast(&p_dst[dst_offset + mDstMyThreadOffset])) = *( reinterpret_cast(&p_src[src_offset + mSrcMyThreadOffset])); @@ -295,12 +298,12 @@ struct Blockwise3dTensorCopy3 for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) { const index_t src_offset = - SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2 * DataPerRead); + SrcDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2 * DataPerRead); - const index_t clipboard_offset = - clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead); + const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex( + iloop_d0, iloop_d1, iloop_d2 * DataPerRead); *(reinterpret_cast(&p_clipboard[clipboard_offset])) = *( reinterpret_cast(&p_src[src_offset + mSrcMyThreadOffset])); @@ -350,13 +353,13 @@ struct Blockwise3dTensorCopy3 #pragma unroll for(index_t iloop_d2 = 0; iloop_d2 < nloop_d2; ++iloop_d2) { - const index_t clipboard_offset = - clipboard_desc.Get1dIndex(iloop_d0, iloop_d1, iloop_d2 * DataPerRead); + const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex( + iloop_d0, iloop_d1, iloop_d2 * DataPerRead); const index_t dst_offset = - DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2 * DataPerRead); + DstDesc{}.GetOffsetFromMultiIndex(iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2 * DataPerRead); 
*(reinterpret_cast(&p_dst[dst_offset + mDstMyThreadOffset])) = *(reinterpret_cast(&p_clipboard[clipboard_offset])); diff --git a/src/include/blockwise_4d_tensor_op.hip.hpp b/src/include/blockwise_4d_tensor_op.hip.hpp index 17c05571a2..754a310afb 100644 --- a/src/include/blockwise_4d_tensor_op.hip.hpp +++ b/src/include/blockwise_4d_tensor_op.hip.hpp @@ -13,7 +13,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst constexpr auto dst_desc = DstDesc{}; - constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths()); + constexpr auto desc = make_packed_ConstantTensorDescriptor(dst_desc.GetLengths()); #if 0 if(get_thread_local_1d_id() == 0) @@ -43,7 +43,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst const index_t did3 = is / desc.GetStride(I3); - const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3); f(p_dst[dindex]); } @@ -70,7 +70,7 @@ blockwise_4d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst const index_t did3 = is / desc.GetStride(I3); - const index_t dindex = dst_desc.Get1dIndex(did0, did1, did2, did3); + const index_t dindex = dst_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3); f(p_dst[dindex]); } @@ -108,7 +108,7 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{}); + constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -132,9 +132,10 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const index_t src_index = 
src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]); - const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const index_t dst_index = + dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]); f(p_src[src_index], p_dst[dst_index]); } @@ -163,9 +164,11 @@ __device__ void blockwise_4d_tensor_pointwise_operation_binary_reorder_by_get_ds did[3] = is / ref_desc.GetStride(I3); - const index_t src_index = src_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const index_t src_index = + src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]); - const index_t dst_index = dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + const index_t dst_index = + dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]); f(p_src[src_index], p_dst[dst_index]); } @@ -256,7 +259,7 @@ struct Blockwise4dTensorCopy1 constexpr index_t read_per_d3 = mod_conv::integer_divide_ceil(L3, DataPerRead); constexpr auto ref_desc = - make_ConstantTensorDescriptor(Sequence{}); + make_packed_ConstantTensorDescriptor(Sequence{}); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; @@ -278,9 +281,9 @@ struct Blockwise4dTensorCopy1 did[3] = is / ref_desc.GetStride(I3); const index_t src_index = - src_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead); const index_t dst_index = - dst_desc.Get1dIndex(did[0], did[1], did[2], did[3] * DataPerRead); + dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3] * DataPerRead); *(reinterpret_cast(p_dst + dst_index)) = *(reinterpret_cast(p_src + src_index)); @@ -333,19 +336,19 @@ struct BlockwiseChwnTensorCopyPadded constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor(DstOpLengths{}); + constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(DstOpLengths{}); constexpr auto 
h_global_pad_low = GlobalLowerPads{}.Get(I0); constexpr auto w_global_pad_low = GlobalLowerPads{}.Get(I1); constexpr index_t NLoop = ref_desc.GetElementSize() / BlockSize; - const Float* p_src_tmp = - p_src + - src_desc.Get1dIndex(c_block_data_begin, - (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, - (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, - n_block_data_begin); + const Float* p_src_tmp = p_src + + src_desc.GetOffsetFromMultiIndex( + c_block_data_begin, + (ho_block_data_begin + h_block_pad_low) - h_global_pad_low, + (wo_block_data_begin + w_block_pad_low) - w_global_pad_low, + n_block_data_begin); #if 0 if(get_thread_local_1d_id() == 0) @@ -389,13 +392,13 @@ struct BlockwiseChwnTensorCopyPadded did[3] = is / ref_desc.GetStride(I3); - const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]); p_dst[bindex] = (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) || did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2)) ? Float(0) - : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])]; + : p_src_tmp[src_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3])]; } constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize); @@ -422,14 +425,16 @@ struct BlockwiseChwnTensorCopyPadded did[3] = is / ref_desc.GetStride(I3); - const index_t bindex = dst_desc.Get1dIndex(did[0], did[1], did[2], did[3]); + const index_t bindex = + dst_desc.GetOffsetFromMultiIndex(did[0], did[1], did[2], did[3]); p_dst[bindex] = (did[1] < h_block_pad_low || did[1] + h_block_pad_up >= ref_desc.GetLength(I1) || did[2] < w_block_pad_low || did[2] + w_block_pad_up >= ref_desc.GetLength(I2)) ? 
Float(0) - : p_src_tmp[src_desc.Get1dIndex(did[0], did[1], did[2], did[3])]; + : p_src_tmp[src_desc.GetOffsetFromMultiIndex( + did[0], did[1], did[2], did[3])]; } } } @@ -505,18 +510,19 @@ struct Blockwise4dTensorCopy3 } } - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(ThreadPerDims{}); - const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); + constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(ThreadPerDims{}); + const auto thread_multi_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); - mSrcMyThreadOffset = SrcDesc{}.Get1dIndex(thread_multi_id[0], - thread_multi_id[1], - thread_multi_id[2], - thread_multi_id[3] * DataPerRead); + mSrcMyThreadOffset = SrcDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0], + thread_multi_id[1], + thread_multi_id[2], + thread_multi_id[3] * DataPerRead); - mDstMyThreadOffset = DstDesc{}.Get1dIndex(thread_multi_id[0], - thread_multi_id[1], - thread_multi_id[2], - thread_multi_id[3] * DataPerRead); + mDstMyThreadOffset = DstDesc{}.GetOffsetFromMultiIndex(thread_multi_id[0], + thread_multi_id[1], + thread_multi_id[2], + thread_multi_id[3] * DataPerRead); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const @@ -564,17 +570,17 @@ struct Blockwise4dTensorCopy3 #pragma unroll for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) { - const index_t src_offset = - SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2, - iloop_d3 * thread_per_d3 * DataPerRead); + const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex( + iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2, + iloop_d3 * thread_per_d3 * DataPerRead); - const index_t dst_offset = - DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2, - iloop_d3 * thread_per_d3 * DataPerRead); + const index_t dst_offset = 
DstDesc{}.GetOffsetFromMultiIndex( + iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2, + iloop_d3 * thread_per_d3 * DataPerRead); *(reinterpret_cast(&p_dst[dst_offset + mDstMyThreadOffset])) = *(reinterpret_cast( @@ -646,7 +652,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_ConstantTensorDescriptor( + constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor( Sequence{}); #pragma unroll @@ -661,13 +667,13 @@ struct Blockwise4dTensorCopy3 #pragma unroll for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) { - const index_t src_offset = - SrcDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2, - iloop_d3 * thread_per_d3 * DataPerRead); + const index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex( + iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2, + iloop_d3 * thread_per_d3 * DataPerRead); - const index_t clipboard_offset = clipboard_desc.Get1dIndex( + const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex( iloop_d0, iloop_d1, iloop_d2, iloop_d3 * DataPerRead); *(reinterpret_cast(&p_clipboard[clipboard_offset])) = @@ -713,7 +719,7 @@ struct Blockwise4dTensorCopy3 constexpr index_t nloop_d2 = L2 / thread_per_d2; constexpr index_t nloop_d3 = mod_conv::integer_divide_ceil(L3, thread_per_d3 * DataPerRead); - constexpr auto clipboard_desc = make_ConstantTensorDescriptor( + constexpr auto clipboard_desc = make_packed_ConstantTensorDescriptor( Sequence{}); #pragma unroll @@ -728,14 +734,14 @@ struct Blockwise4dTensorCopy3 #pragma unroll for(index_t iloop_d3 = 0; iloop_d3 < nloop_d3; ++iloop_d3) { - const index_t clipboard_offset = clipboard_desc.Get1dIndex( + const index_t clipboard_offset = clipboard_desc.GetOffsetFromMultiIndex( iloop_d0, iloop_d1, iloop_d2, 
iloop_d3 * DataPerRead); - const index_t dst_offset = - DstDesc{}.Get1dIndex(iloop_d0 * thread_per_d0, - iloop_d1 * thread_per_d1, - iloop_d2 * thread_per_d2, - iloop_d3 * thread_per_d3 * DataPerRead); + const index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex( + iloop_d0 * thread_per_d0, + iloop_d1 * thread_per_d1, + iloop_d2 * thread_per_d2, + iloop_d3 * thread_per_d3 * DataPerRead); *(reinterpret_cast(&p_dst[dst_offset + mDstMyThreadOffset])) = *(reinterpret_cast(&p_clipboard[clipboard_offset])); diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp index 98f36011f3..6e397d1efa 100644 --- a/src/include/blockwise_batched_gemm.hip.hpp +++ b/src/include/blockwise_batched_gemm.hip.hpp @@ -87,10 +87,10 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 const auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); mMyThreadOffsetA = c_thread_mtx_index.batch * BlockMatrixStrideA + - a_block_mtx.Get1dIndex(0, c_thread_mtx_index.row); + a_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.row); mMyThreadOffsetB = c_thread_mtx_index.batch * BlockMatrixStrideB + - b_block_mtx.Get1dIndex(0, c_thread_mtx_index.col); + b_block_mtx.GetOffsetFromMultiIndex(0, c_thread_mtx_index.col); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -221,10 +221,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 threadwise_matrix_copy( a_block_mtx, p_a_block + - a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + + a_block_mtx.GetOffsetFromMultiIndex(k_begin, + m_repeat * MPerLevel1Cluster) + ib * BlockMatrixStrideA + mMyThreadOffsetA, a_thread_mtx, - p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), + p_a_thread + + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths(), Number{}); } @@ -238,10 +240,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 
threadwise_matrix_copy( b_block_mtx, p_b_block + - b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + + b_block_mtx.GetOffsetFromMultiIndex(k_begin, + n_repeat * NPerLevel1Cluster) + ib * BlockMatrixStrideB + mMyThreadOffsetB, b_thread_mtx, - p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), + p_b_thread + + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths(), Number{}); } @@ -343,9 +347,11 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 reg_a[0] = *reinterpret_cast(&p_a_block[mMyThreadOffsetA]); reg_b[0] = *reinterpret_cast(&p_b_block[mMyThreadOffsetB]); reg_b[1] = *reinterpret_cast( - &p_b_block[b_block_mtx.Get1dIndex(0, NPerLevel1Cluster) + mMyThreadOffsetB]); + &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(0, NPerLevel1Cluster) + + mMyThreadOffsetB]); reg_a[1] = *reinterpret_cast( - &p_a_block[a_block_mtx.Get1dIndex(0, MPerLevel1Cluster) + mMyThreadOffsetA]); + &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(0, MPerLevel1Cluster) + + mMyThreadOffsetA]); outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); @@ -353,15 +359,17 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 for(index_t k = 1; k < K; ++k) { reg_a[0] = *reinterpret_cast( - &p_a_block[a_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetA]); + &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetA]); outerProduct4x4(reg_a[1], reg_b[0], reg_c[8], reg_c[10], reg_c[12], reg_c[14]); reg_b[0] = *reinterpret_cast( - &p_b_block[b_block_mtx.Get1dIndex(k, 0) + mMyThreadOffsetB]); + &p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, 0) + mMyThreadOffsetB]); outerProduct4x4(reg_a[1], reg_b[1], reg_c[9], reg_c[11], reg_c[13], reg_c[15]); reg_b[1] = *reinterpret_cast( - &p_b_block[b_block_mtx.Get1dIndex(k, NPerLevel1Cluster) + mMyThreadOffsetB]); + 
&p_b_block[b_block_mtx.GetOffsetFromMultiIndex(k, NPerLevel1Cluster) + + mMyThreadOffsetB]); reg_a[1] = *reinterpret_cast( - &p_a_block[a_block_mtx.Get1dIndex(k, MPerLevel1Cluster) + mMyThreadOffsetA]); + &p_a_block[a_block_mtx.GetOffsetFromMultiIndex(k, MPerLevel1Cluster) + + mMyThreadOffsetA]); outerProduct4x4(reg_a[0], reg_b[0], reg_c[0], reg_c[2], reg_c[4], reg_c[6]); outerProduct4x4(reg_a[0], reg_b[1], reg_c[1], reg_c[3], reg_c[5], reg_c[7]); } @@ -489,7 +497,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 const index_t c_thread_offset = c_thread_mtx_begin.batch * BlockMatrixStrideC + - c_block_mtx.Get1dIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); + c_block_mtx.GetOffsetFromMultiIndex(c_thread_mtx_begin.row, c_thread_mtx_begin.col); for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat) { @@ -498,12 +506,12 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2 threadwise_matrix_copy( c_thread_sub_mtx, p_c_thread + - c_thread_sub_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster), + c_thread_sub_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster), c_block_mtx, p_c_block + - c_block_mtx.Get1dIndex(m_repeat * MPerLevel1Cluster, - n_repeat * NPerLevel1Cluster) + + c_block_mtx.GetOffsetFromMultiIndex(m_repeat * MPerLevel1Cluster, + n_repeat * NPerLevel1Cluster) + c_thread_offset, c_thread_sub_mtx.GetLengths()); } diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp index 3159eb5ae2..fd51d86e15 100644 --- a/src/include/blockwise_gemm.hip.hpp +++ b/src/include/blockwise_gemm.hip.hpp @@ -51,8 +51,8 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); - mMyThreadOffsetA = BlockMatrixA::Get1dIndex(0, c_thread_mtx_index.row); - mMyThreadOffsetB = BlockMatrixB::Get1dIndex(0, c_thread_mtx_index.col); + mMyThreadOffsetA = 
BlockMatrixA::GetOffsetFromMultiIndex(0, c_thread_mtx_index.row); + mMyThreadOffsetB = BlockMatrixB::GetOffsetFromMultiIndex(0, c_thread_mtx_index.col); } __device__ static auto GetThreadMatrixCLengths() @@ -248,10 +248,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 { threadwise_matrix_copy( a_block_mtx, - p_a_block + a_block_mtx.Get1dIndex(k_begin, m_repeat * MPerLevel1Cluster) + + p_a_block + + a_block_mtx.GetOffsetFromMultiIndex(k_begin, m_repeat * MPerLevel1Cluster) + mMyThreadOffsetA, a_thread_mtx, - p_a_thread + a_thread_mtx.Get1dIndex(0, m_repeat * MPerThreadSubC), + p_a_thread + a_thread_mtx.GetOffsetFromMultiIndex(0, m_repeat * MPerThreadSubC), a_thread_sub_mtx.GetLengths(), Number{}); } @@ -262,10 +263,11 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 { threadwise_matrix_copy( b_block_mtx, - p_b_block + b_block_mtx.Get1dIndex(k_begin, n_repeat * NPerLevel1Cluster) + + p_b_block + + b_block_mtx.GetOffsetFromMultiIndex(k_begin, n_repeat * NPerLevel1Cluster) + mMyThreadOffsetB, b_thread_mtx, - p_b_thread + b_thread_mtx.Get1dIndex(0, n_repeat * NPerThreadSubC), + p_b_thread + b_thread_mtx.GetOffsetFromMultiIndex(0, n_repeat * NPerThreadSubC), b_thread_sub_mtx.GetLengths(), Number{}); } diff --git a/src/include/blockwise_merged_tensor_slice_op.hip.hpp b/src/include/blockwise_merged_tensor_slice_op.hip.hpp index b67a239455..b47df1f602 100644 --- a/src/include/blockwise_merged_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_merged_tensor_slice_op.hip.hpp @@ -11,7 +11,7 @@ template struct BlockwiseTensorSliceCopy_generic_v1 @@ -21,35 +21,142 @@ struct BlockwiseTensorSliceCopy_generic_v1 index_t mSrcMyThreadOffset; index_t mDstMyThreadOffset; - __device__ BlockwiseTensorSliceCopy_generic_v1(Array src_block_multi_id_offset, - Array dst_block_multi_id_offset) + __device__ BlockwiseTensorSliceCopy_generic_v1(Array src_block_multi_offset, + Array dst_block_multi_offset) { - // only support SrcSubLengths.GetLength() == 1 on 
merged dimension, for now - // check SrcDataPerRead should be 1, if last dimension is a merged dimension - // check NDim consistent + static_assert(SrcDesc::GetNumOfDimension() == DstDesc::GetNumOfDimension(), "wrong"); - // calculate mSrcMyThreadOffset - // calculate mDstMyThreadOffset + constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor( + ClusterLengths{}.ReorderGivenNew2Old(ThreadClusterArrangeOrder{})); + + // BlockSize + static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize"); + + // divide work + static_for<0, nDim, 1>{}([&](auto IDim) { + static_assert(SliceLengths{}.Get(IDim) % SubLengths{}.Get(IDim) == 0, + "wrong! cannot evenly divide sliced tensor into sub-tensor"); + }); + + constexpr auto thread_work_desc = + make_packed_ConstantTensorDescriptor(SliceLengths{} / SubLengths{}); + + static_for<0, nDim, 1>{}([&](auto IDim) { + static_assert(thread_work_desc.GetLength(IDim) % thread_cluster_desc.GetLength(IDim) == 0, + "wrong! cannot evenly divide work to cluster"); + }); + + // only support SubLengths.Get() == 1 on merged dimension, for now + static_for<0, nDim, 1>{}([&](auto IDim) { + static_if<(SrcDesc::ContainMultipleOriginalDimensions(IDim) || + DstDesc::ContainMultipleOriginalDimensions(IDim))>{}([&](auto fwd) { + static_assert(fwd(SubLengths{}).Get(IDim) == 1, + "wrong!
Sub-Lengths on merged dimension should be 1"); + }); + }); + + // calculate mSrcMyThreadOffset, mDstMyThreadOffset + const auto thread_cluster_multi_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); + + const auto data_cluster_multi_id = + reorder_array_given_old2new(thread_cluster_multi_id, ThreadClusterArrangeOrder{}); + + const auto thread_data_multi_offset = data_cluster_multi_id * SubLengths{}; + + mSrcMyThreadOffset = + SrcDesc::GetOffsetFromMultiIndex(src_block_multi_offset + thread_data_multi_offset); + mDstMyThreadOffset = + DstDesc::GetOffsetFromMultiIndex(dst_block_multi_offset + thread_data_multi_offset); } - __device__ static constexpr index_t GetRegisterClipboardSize() {} + __device__ static constexpr index_t GetRegisterClipboardSize() + { + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + + constexpr auto thread_tensor_desc = + make_packed_ConstantTensorDescriptor(SubLengths{} * repeat_lengths); + + return thread_tensor_desc.GetElementSpaceSize(); + } __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src, Float* __restrict__ p_clipboard) const { + constexpr auto thread_sub_tensor_lengths = SubLengths{}; + + constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{}; + + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + + constexpr auto thread_tensor_desc = + make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths); + + static_ford{}([&](auto repeat_multi_id_) { + constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; + + constexpr auto src_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims; + + constexpr auto clipboard_data_multi_offset = + repeat_multi_id * thread_sub_tensor_lengths; + + constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_offset); + constexpr index_t clipboard_offset = +
thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset); + + threadwise_tensor_slice_copy_generic(SrcDesc{}, + p_src + src_offset + mSrcMyThreadOffset, + thread_tensor_desc, + zero_array{}, + thread_tensor_desc, + p_clipboard + clipboard_offset, + zero_array{}, + thread_sub_tensor_lengths, + SrcAccessOrder{}); + }); } __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard, Float* __restrict__ p_dst) const { + constexpr auto thread_sub_tensor_lengths = SubLengths{}; + + constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * ClusterLengths{}; + + constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ClusterLengths{}); + + constexpr auto thread_tensor_desc = + make_packed_ConstantTensorDescriptor(thread_sub_tensor_lengths * repeat_lengths); + + static_ford{}([&](auto repeat_multi_id_) { + constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; + + constexpr auto clipboard_data_multi_offset = + repeat_multi_id * thread_sub_tensor_lengths; + + constexpr auto dst_data_multi_offset = repeat_multi_id * data_per_cluster_per_dims; + + constexpr index_t clipboard_offset = + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_offset); + + constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_offset); + + threadwise_tensor_slice_copy_generic(thread_tensor_desc, + p_clipboard + clipboard_offset, + zero_array{}, + DstDesc{}, + p_dst + dst_offset + mDstMyThreadOffset, + zero_array{}, + thread_sub_tensor_lengths, + DstAccessOrder{}); + }); } __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const { - Float p_clipboard[GetRegisterClipboardSize()]; + Float p_clipboard[GetRegisterClipboardSize()]; - RunLoadRegisterClipboard(p_src, p_clipboard); - RunStoreRegisterClipboard(p_clipboard, p_dst); + RunLoadRegisterClipboard(p_src, p_clipboard); + RunStoreRegisterClipboard(p_clipboard, p_dst); -}; + }; diff --git 
a/src/include/blockwise_tensor_slice_op.hip.hpp b/src/include/blockwise_tensor_slice_op.hip.hpp index 22c01250ce..6a32754a28 100644 --- a/src/include/blockwise_tensor_slice_op.hip.hpp +++ b/src/include/blockwise_tensor_slice_op.hip.hpp @@ -39,7 +39,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_cluster_lengths = src_cluster_lengths.ReorderGivenNew2Old(map_thread_cluster_2_src_cluster); - constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor(thread_cluster_lengths); + constexpr auto thread_cluster_desc = make_packed_ConstantTensorDescriptor(thread_cluster_lengths); // sanity check: data type static_assert(is_same::value, "wrong! only support float for now!\n"); @@ -105,7 +105,8 @@ struct BlockwiseTensorSliceReorderCopy_v3 } } - const auto thread_multi_id = thread_cluster_desc.GetMultiIndex(get_thread_local_1d_id()); + const auto thread_multi_id = + thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id()); // compiler: thread_multi_id, src_data_multi_id, dst_data_multi_id, will use separate // regsiters, or only one copy??? @@ -115,17 +116,21 @@ struct BlockwiseTensorSliceReorderCopy_v3 static_for<0, nDim, 1>{}([&](auto IDim) { constexpr auto I = decltype(IDim){}; constexpr index_t i = I.Get(); - // compiler: will it really compute index here, or be merged with Get1dIndex and + // compiler: will it really compute index here, or be merged with + // GetOffsetFromMultiIndex and // optimized away??? src_data_multi_id[i] *= src_sub_lengths.Get(I); }); - // compiler: will it really compute index here, or be merged with Get1dIndex and + // compiler: will it really compute index here, or be merged with GetOffsetFromMultiIndex + // and // optimized away??? 
const auto dst_data_multi_id = reorder_array_given_new2old(src_data_multi_id, map_dst2src); - mSrcMyThreadOffset = src_desc.Get1dIndex(src_data_multi_id + src_block_data_multi_id_begin); - mDstMyThreadOffset = dst_desc.Get1dIndex(dst_data_multi_id + dst_block_data_multi_id_begin); + mSrcMyThreadOffset = + src_desc.GetOffsetFromMultiIndex(src_data_multi_id + src_block_data_multi_id_begin); + mDstMyThreadOffset = + dst_desc.GetOffsetFromMultiIndex(dst_data_multi_id + dst_block_data_multi_id_begin); } __device__ static constexpr index_t GetRegisterClipboardSize() @@ -142,7 +147,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); + constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); return thread_tensor_desc.GetElementSpace(); } @@ -162,7 +167,7 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); + constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; @@ -171,9 +176,9 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto clipboard_data_multi_id = repeat_multi_id * thread_sub_tensor_lengths; - constexpr index_t src_offset = SrcDesc{}.Get1dIndex(src_data_multi_id); + constexpr index_t src_offset = SrcDesc{}.GetOffsetFromMultiIndex(src_data_multi_id); constexpr index_t clipboard_offset = - thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id); threadwise_tensor_slice_copy(SrcDesc{}, p_src + src_offset + mSrcMyThreadOffset, @@ -199,7 +204,7 @@ struct 
BlockwiseTensorSliceReorderCopy_v3 constexpr auto thread_tensor_lengths = thread_sub_tensor_lengths * repeat_lengths; - constexpr auto thread_tensor_desc = make_ConstantTensorDescriptor(thread_tensor_lengths); + constexpr auto thread_tensor_desc = make_packed_ConstantTensorDescriptor(thread_tensor_lengths); static_ford{}([&](auto repeat_multi_id_) { constexpr auto repeat_multi_id = decltype(repeat_multi_id_){}; @@ -212,9 +217,9 @@ struct BlockwiseTensorSliceReorderCopy_v3 constexpr auto dst_data_multi_id = src_data_multi_id.ReorderGivenNew2Old(MapDst2Src{}); constexpr index_t clipboard_offset = - thread_tensor_desc.Get1dIndex(clipboard_data_multi_id); + thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id); - constexpr index_t dst_offset = DstDesc{}.Get1dIndex(dst_data_multi_id); + constexpr index_t dst_offset = DstDesc{}.GetOffsetFromMultiIndex(dst_data_multi_id); // write in the order of dst #if 1 diff --git a/src/include/conv_common.hip.hpp b/src/include/conv_common.hip.hpp index d9bf22b9c8..e56743f242 100644 --- a/src/include/conv_common.hip.hpp +++ b/src/include/conv_common.hip.hpp @@ -30,7 +30,7 @@ __host__ __device__ constexpr auto get_convolution_output_default_4d_tensor_desc constexpr auto HO = HI + 1 - Y; constexpr auto WO = WI + 1 - X; - return make_ConstantTensorDescriptor(Sequence{}); + return make_packed_ConstantTensorDescriptor(Sequence{}); } template @@ -67,7 +67,7 @@ __host__ __device__ constexpr auto get_convolution_with_padding_output_default_4 constexpr auto HO = HI + HPadLow + HPadUp + 1 - Y; constexpr auto WO = WI + WPadLow + WPadUp + 1 - X; - return make_ConstantTensorDescriptor(Sequence{}); + return make_packed_ConstantTensorDescriptor(Sequence{}); } template diff --git a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp index 3590e63f1e..c169d7feb0 100644 --- a/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp +++ 
b/src/include/gridwise_convolution_direct_v2_nchw_kcyx_nkhw.hip.hpp @@ -180,18 +180,19 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_global + - in_nchw_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), - p_in_block); + blockwise_in_copy.Run( + p_in_global + + in_nchw_global_desc.GetOffsetFromMultiIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_global + - wei_kcyx_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_block); + blockwise_wei_copy.Run(p_wei_global + + wei_kcyx_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_block); __syncthreads(); @@ -202,26 +203,28 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw threadwise_direct_convolution_2( in_nchw_thread_block_desc, p_in_block + - in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_block_desc.GetOffsetFromMultiIndex( + k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( in_nchw_thread_block_desc, p_in_block + - in_nchw_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + in_nchw_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_thread_block_desc, p_wei_block + - wei_kcyx_block_desc.Get1dIndex(k_thread_data_begin, 
c_thread_data, 0, 0), + wei_kcyx_block_desc.GetOffsetFromMultiIndex( + k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #endif @@ -229,16 +232,16 @@ struct GridwiseConvolutionDirect_v2_nchw_kcyx_nkhw } // copy output tensor from register to global mem - threadwise_tensor_slice_copy( - out_nkhw_thread_desc, - p_out_thread, - out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_nkhw_thread_desc.GetLengths(), - Number<1>{}); + threadwise_tensor_slice_copy(out_nkhw_thread_desc, + p_out_thread, + out_nkhw_global_desc, + p_out_global + + out_nkhw_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths(), + Number<1>{}); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp index b5d8632d34..590f548b58 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r1_chwn_cyxk_khwn.hip.hpp @@ -221,11 +221,12 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; 
c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), @@ -261,8 +262,8 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn #else blockwise_batch_gemm.Run_asm #endif - (p_wei_block + wei_c_y_x_k_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + in_c_h_w_n_block_desc.Get1dIndex(0, y, x, 0), + (p_wei_block + wei_c_y_x_k_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_out_thread); } } @@ -325,17 +326,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }).else_([&](auto f_dummy) { static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -375,17 +376,17 @@ struct GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); 
+ threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp index 5e18c6ef99..2a6995929c 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_chwn_cyxk_khwn.hip.hpp @@ -230,11 +230,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn #if 1 const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), @@ -242,22 +243,24 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn { for(index_t y = 0; y < Y; ++y) { - blockwise_in_copy.Run(p_in_global_block_offset + - in_c_h_w_n_global_desc.Get1dIndex(0, y, 0, 0), - p_in_block); + blockwise_in_copy.Run( + p_in_global_block_offset + + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0), + p_in_block); - blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global_block_offset + + 
wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0), + p_wei_block); __syncthreads(); for(index_t x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), - p_in_block + - in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), - p_out_thread); + blockwise_batch_gemm.Run( + p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), + p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), + p_out_thread); } __syncthreads(); @@ -269,11 +272,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin + y, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; @@ -290,10 +294,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn for(index_t x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), - p_in_block + - in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), - p_out_thread); + blockwise_batch_gemm.Run( + p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), + p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), + p_out_thread); } __syncthreads(); @@ -358,17 +362,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - 
Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }).else_([&](auto f_dummy) { static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -408,17 +412,17 @@ struct GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp index 510836566b..89a47b9e64 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp @@ -221,11 +221,11 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn #if 0 const Float* p_in_global_block_offset = - p_in_global + in_n_c_h_w_global_desc.Get1dIndex( + p_in_global + 
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), @@ -234,20 +234,20 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn for(index_t y = 0; y < Y; ++y) { blockwise_in_copy_reorder.Run(p_in_global_block_offset + - in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, 0), + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, 0), p_in_block); blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, 0), + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, 0), p_wei_block); __syncthreads(); for(index_t x = 0; x < X; ++x) { - blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), + blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), p_in_block + - in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), p_out_thread); } @@ -259,11 +259,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; @@ -287,10 +288,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn for(index_t x = 0; x < 
X; ++x) { - blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get1dIndex(0, x, 0), - p_in_block + - in_c_h_w_n_block_desc.Get1dIndex(0, 0, x, 0), - p_out_thread); + blockwise_batch_gemm.Run( + p_wei_block + wei_c_x_k_block_desc.GetOffsetFromMultiIndex(0, x, 0), + p_in_block + in_c_h_w_n_block_desc.GetOffsetFromMultiIndex(0, 0, x, 0), + p_out_thread); } __syncthreads(); @@ -336,16 +337,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn } #endif - threadwise_10d_tensor_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_10d_tensor_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp index 35b5d87a84..d6190a2c76 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp @@ -82,7 +82,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn constexpr auto block_work_desc = make_ConstantTensorDescriptor( Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t n_block_data_begin = block_work_multi_id[0] * 
NPerBlock; const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock; @@ -225,11 +226,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #if 1 const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin, wi_block_data_begin, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_c_h_w_n_global_desc.GetStride(I0), @@ -240,13 +242,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn #pragma unroll for(index_t x = 0; x < X; ++x) { - blockwise_in_copy.Run(p_in_global_block_offset + - in_c_h_w_n_global_desc.Get1dIndex(0, y, x, 0), - p_in_block); + blockwise_in_copy.Run( + p_in_global_block_offset + + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_in_block); - blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global_block_offset + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_wei_block); __syncthreads(); @@ -263,11 +267,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; 
c_block_data_begin += CPerBlock, @@ -347,17 +352,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }).else_([&](auto f_dummy) { static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -397,17 +402,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git 
a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp index c0a67837b6..a1cd646b7f 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp @@ -85,10 +85,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); constexpr index_t NBlockWork = mod_conv::integer_divide_ceil(N, NPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t k_block_data_begin = block_work_multi_id[0] * KPerBlock; const index_t ho_block_data_begin = block_work_multi_id[1] * HoPerBlock; @@ -108,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); @@ -117,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = 
make_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( Sequence{}); // blockwise copy @@ -243,11 +244,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_c_h_w_n_global_desc.Get1dIndex( + in_c_h_w_n_global_desc.GetOffsetFromMultiIndex( 0, hi_block_data_begin + y, wi_block_data_begin + x, n_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); // LDS double buffer: preload data into LDS { @@ -399,17 +401,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }).else_([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -450,17 +452,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - 
out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp index 45933c6bc2..7e1f08c42f 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp @@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock; const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock; @@ -101,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn // global tensor view constexpr auto wei_c_k_global_desc = - 
make_ConstantTensorDescriptor(Sequence{}, Sequence{}); + make_ranked_ConstantTensorDescriptor(Sequence{}, Sequence{}); // LDS tensor view // be careful of alignment @@ -110,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn GemmDataPerReadA, GemmDataPerReadB); - constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); @@ -119,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( Sequence{}); // blockwise copy @@ -241,11 +242,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); // LDS double buffer: preload data into LDS { @@ -359,13 +361,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn const index_t wo_thread_data_begin = c_thread_mtx_begin.col / NPerBlock; const index_t n_thread_data_begin = c_thread_mtx_begin.col % NPerBlock; - static_if{}([&](auto 
f_dummy) { // f_dummy do nothing but + static_if{}([&](auto fwd) { // fwd do nothing but // perfect forwarding. // Using this trick to // make this lambda a generic lambda, so it won't be compiled until // instantiated static_assert( - (f_dummy(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), + (fwd(GemmNPerThreadSubC) <= NPerBlock && NPerBlock % GemmNPerThreadSubC == 0), "wrong!"); // output is a 10d tensor @@ -373,12 +375,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t N1 = NPerBlock / N2; constexpr index_t W2 = - (GemmNLevel0Cluster * GemmNLevel1Cluster) / f_dummy(NPerBlock / GemmNPerThreadSubC); + (GemmNLevel0Cluster * GemmNLevel1Cluster) / fwd(NPerBlock / GemmNPerThreadSubC); constexpr index_t W1 = WoPerBlock / W2; constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; +#if 0 constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor(Sequence{}); constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( Sequence{}); +#else + constexpr auto out_10d_global_desc = fwd(out_k_h_w_n_global_desc) + .Fold(I3, Number{}, Number{}) + .Fold(I2, Number{}, Number{}) + .Fold(I0, Number{}, Number{}); + + constexpr auto out_10d_thread_desc = fwd(out_k_h_w_n_thread_desc) + .Fold(I3, Number<1>{}, Number{}) + .Fold(I2, Number{}, Number<1>{}) + .Fold(I0, Number<1>{}, Number{}); +#endif #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -407,19 +421,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); - }).else_([&](auto 
f_dummy) { - static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); + }).else_([&](auto fwd) { + static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, "wrong!"); @@ -428,16 +442,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn constexpr index_t W3 = GemmNPerThreadSubC / NPerBlock; constexpr index_t W2 = GemmNLevel0Cluster * GemmNLevel1Cluster; - constexpr index_t W1 = WoPerBlock / f_dummy(W2 * W3); + constexpr index_t W1 = WoPerBlock / fwd(W2 * W3); constexpr index_t K2 = GemmMPerThreadSubC; constexpr index_t K1 = KPerBlock / KPerThread; - constexpr auto out_10d_global_desc = make_ConstantTensorDescriptor( +#if 0 + constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor( Sequence{}); - constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor( + constexpr auto out_10d_thread_desc = make_packed_ConstantTensorDescriptor( Sequence{}); +#else + constexpr auto out_10d_global_desc = + fwd(out_k_h_w_n_global_desc) + .Fold(I3, Number{}) + .Fold(I2, Number{}, Number{}, Number{}) + .Fold(I0, Number{}, Number{}); + + constexpr auto out_10d_thread_desc = + fwd(out_k_h_w_n_thread_desc) + .Fold(I3, Number{}) + .Fold(I2, Number{}, Number<1>{}, Number{}) + .Fold(I0, Number<1>{}, Number{}); +#endif #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -457,17 +485,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - 
out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp index 2bfe348d0d..b8689b9257 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp @@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw constexpr index_t HBlockWork = mod_conv::integer_divide_ceil(Ho, HoPerBlock); constexpr index_t WBlockWork = mod_conv::integer_divide_ceil(Wo, WoPerBlock); - constexpr auto block_work_desc = make_ConstantTensorDescriptor( + constexpr auto block_work_desc = make_packed_ConstantTensorDescriptor( Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock; const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock; @@ -109,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw GemmDataPerReadA, GemmDataPerReadB); 
- constexpr auto in_c_h_w_n_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto in_c_h_w_n_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); @@ -118,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw static_assert(in_c_h_w_n_block_desc.GetStride(I1) % GemmDataPerReadB == 0, "GemmDataPerReadB alignment requirement is not meet"); - constexpr auto wei_c_k_block_desc = make_ConstantTensorDescriptor_aligned( + constexpr auto wei_c_k_block_desc = make_ranked_ConstantTensorDescriptor_with_alignment( Sequence{}, Number{}); // tensor view of threadwise output in register - constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor( + constexpr auto out_k_h_w_n_thread_desc = make_packed_ConstantTensorDescriptor( Sequence{}); // blockwise copy @@ -240,11 +241,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw { const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); // LDS double buffer: preload data into LDS { @@ -407,10 +409,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw p_out_thread, out_10d_global_desc, p_out_global + - out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + 
wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); // Number{}); @@ -461,10 +464,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw p_out_thread, out_10d_global_desc, p_out_global + - out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); // Number{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp index 3b3107ff70..8f058f83e9 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp @@ -236,11 +236,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn #if 1 const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), @@ -251,23 +252,27 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn for(index_t x = 0; x < X; ++x) { #if 1 - 
blockwise_in_copy_reorder.Run(p_in_global_block_offset + - in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), - p_in_block); + blockwise_in_copy_reorder.Run( + p_in_global_block_offset + + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), + p_in_block); - blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), - p_wei_block); + blockwise_wei_copy.Run( + p_wei_global_block_offset + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_wei_block); #else Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; blockwise_in_copy_reorder.RunLoadRegisterClipboard( - p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), + p_in_global_block_offset + + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), p_in_clipboard); blockwise_wei_copy.RunLoadRegisterClipboard( - p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), + p_wei_global_block_offset + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_wei_clipboard); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); @@ -291,11 +296,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn { const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, @@ -390,17 +396,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - 
out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }).else_([&](auto f_dummy) { static_assert(f_dummy(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -440,17 +446,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn } #endif - threadwise_tensor_slice_copy( - out_10d_thread_desc, - p_out_thread, - out_10d_global_desc, - p_out_global + - out_k_h_w_n_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), - out_10d_thread_desc.GetLengths(), - Number{}); + threadwise_tensor_slice_copy(out_10d_thread_desc, + p_out_thread, + out_10d_global_desc, + p_out_global + + out_k_h_w_n_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), + out_10d_thread_desc.GetLengths(), + Number{}); }); } }; diff --git a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp index ab66902f65..7c525b1c17 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp +++ 
b/src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp @@ -86,7 +86,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw constexpr auto block_work_desc = make_ConstantTensorDescriptor( Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t n_block_data_begin = block_work_multi_id[0] * NPerBlock; const index_t k_block_data_begin = block_work_multi_id[1] * KPerBlock; @@ -234,11 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw #if 0 const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1), @@ -250,22 +251,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw { #if 1 blockwise_in_copy_reorder.Run(p_in_global_block_offset + - in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), p_in_block); blockwise_wei_copy.Run(p_wei_global_block_offset + - wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_wei_block); #else Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()]; Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()]; blockwise_in_copy_reorder.RunLoadRegisterClipboard( - p_in_global_block_offset + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x), + p_in_global_block_offset + 
in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x), p_in_clipboard); blockwise_wei_copy.RunLoadRegisterClipboard( - p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0), + p_wei_global_block_offset + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_wei_clipboard); blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block); @@ -289,11 +290,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw { const Float* p_in_global_block_offset = p_in_global + - in_n_c_h_w_global_desc.Get1dIndex( + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex( n_block_data_begin, 0, hi_block_data_begin + y, wi_block_data_begin + x); const Float* p_wei_global_block_offset = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, k_block_data_begin); + p_wei_global + + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, @@ -395,10 +397,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw p_out_thread, out_10d_global_desc, p_out_global + - out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); // Number{}); @@ -444,10 +447,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw p_out_thread, out_10d_global_desc, p_out_global + - out_n_k_h_w_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), + 
out_n_k_h_w_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), out_10d_thread_desc.GetLengths(), map_out_global2thread); // Number{}); diff --git a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp index eee1dd63ad..c78a16713b 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp @@ -193,10 +193,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn __shared__ Float p_wei_block[wei_block_space]; const Float* p_in_global_block_offset = - p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); // register Float p_out_thread[out_kb_thread_desc.GetElementSpace()]; @@ -236,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn #elif 1 blockwise_gemm.Run_asm #endif - (p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + (p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block + y * Wi + x, p_out_thread); } @@ -267,8 +268,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn if(n_data < N && h_data < Ho && w_data < Wo) { - p_out_global[out_khwn_global_desc.Get1dIndex(k_data, h_data, w_data, n_data)] = - p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex( + k_data, h_data, w_data, n_data)] = + p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)]; } } } diff --git 
a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp index fefed3a3e6..60baf437cf 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp @@ -198,10 +198,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer __shared__ Float p_wei_block_double[2 * wei_block_space]; const Float* p_in_global_block_offset = - p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin); + p_in_global + in_cb_global_desc.GetOffsetFromMultiIndex(0, b_block_data_begin); const Float* p_wei_global_block_offset = - p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin); + p_wei_global + + wei_cyxk_global_desc.GetOffsetFromMultiIndex(0, 0, 0, k_block_data_begin); // preload data into LDS { @@ -269,7 +270,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block_now + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + (p_wei_block_now + + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block_now + y * Wi + x, p_out_thread); } @@ -310,7 +312,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer #elif 0 blockwise_gemm.Run_asm #endif - (p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + (p_wei_block_double + + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block_double + y * Wi + x, p_out_thread); } @@ -336,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer blockwise_gemm.Run_asm #endif (p_wei_block_double + wei_block_space + - wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), p_in_block_double + in_block_space + y * Wi + x, p_out_thread); } @@ 
-365,14 +368,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer constexpr auto out_kb_global_desc = make_ConstantTensorDescriptor(Sequence{}); - threadwise_6d_tensor_copy( - out_6d_thread_desc, - p_out_thread, - out_6d_global_desc, - p_out_global + - out_kb_global_desc.Get1dIndex(k_thread_data_begin, b_thread_data_begin), - out_6d_thread_desc.GetLengths(), - Number{}); + threadwise_6d_tensor_copy(out_6d_thread_desc, + p_out_thread, + out_6d_global_desc, + p_out_global + + out_kb_global_desc.GetOffsetFromMultiIndex( + k_thread_data_begin, b_thread_data_begin), + out_6d_thread_desc.GetLengths(), + Number{}); } else { @@ -393,9 +396,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer if(n_data < N && h_data < Ho && w_data < Wo) { - p_out_global[out_khwn_global_desc.Get1dIndex( + p_out_global[out_khwn_global_desc.GetOffsetFromMultiIndex( k_data, h_data, w_data, n_data)] = - p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]; + p_out_thread[out_kb_thread_desc.GetOffsetFromMultiIndex(k, b)]; } } } diff --git a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp index 0ff514e398..2af70e8314 100644 --- a/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp +++ b/src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp @@ -83,7 +83,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw constexpr auto block_work_desc = make_ConstantTensorDescriptor(Sequence{}); - const auto block_work_multi_id = block_work_desc.GetMultiIndex(get_block_1d_id()); + const auto block_work_multi_id = + block_work_desc.GetMultiIndexFrom1dIndex(get_block_1d_id()); const index_t k_block_data_on_global = block_work_multi_id[0] * KPerBlock; const index_t b_block_data_on_global = block_work_multi_id[1] * BPerBlock; @@ -219,10 +220,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw { // calculate 
origin of block input and weight tensor on global memory const Float* p_in_block_on_global = - p_in_global + in_n_c_h_w_global_desc.Get1dIndex(0, 0, y, x); + p_in_global + in_n_c_h_w_global_desc.GetOffsetFromMultiIndex(0, 0, y, x); const Float* p_wei_block_on_global = - p_wei_global + wei_c_y_x_k_global_desc.Get1dIndex(0, y, x, 0); + p_wei_global + wei_c_y_x_k_global_desc.GetOffsetFromMultiIndex(0, y, x, 0); for(index_t c_block_data_on_global = 0; @@ -285,7 +286,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw // origin of thread tensor in global memory const index_t p_out_thread_on_global = p_out_global + - out_k_n1_b_n2_global_merged_desc.Get1dIndex( + out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex( k_thread_data_on_global, 0, 0, 0); // dst origin on merged global tensor // copy diff --git a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp index 0b83eccc3c..6a73eb4b97 100644 --- a/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp +++ b/src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp @@ -190,18 +190,19 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( c_block_data_begin += CPerBlock, __syncthreads()) { // copy input tensor to LDS - blockwise_in_copy.Run(p_in_vec_global + - in_nchw_vec_global_desc.Get1dIndex(n_block_data_begin, - c_block_data_begin, - hi_block_data_begin, - wi_block_data_begin), - p_in_vec_block); + blockwise_in_copy.Run( + p_in_vec_global + + in_nchw_vec_global_desc.GetOffsetFromMultiIndex(n_block_data_begin, + c_block_data_begin, + hi_block_data_begin, + wi_block_data_begin), + p_in_vec_block); // copy weight tensor to LDS - blockwise_wei_copy.Run( - p_wei_vec_global + - wei_kcyx_vec_global_desc.Get1dIndex(k_block_data_begin, c_block_data_begin, 0, 0), - p_wei_vec_block); + blockwise_wei_copy.Run(p_wei_vec_global + + 
wei_kcyx_vec_global_desc.GetOffsetFromMultiIndex( + k_block_data_begin, c_block_data_begin, 0, 0), + p_wei_vec_block); __syncthreads(); @@ -212,26 +213,28 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( threadwise_direct_convolution_2( in_nchw_vec_thread_block_desc, p_in_vec_block + - in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + - wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex( + k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #elif 0 threadwise_direct_convolution_3( in_nchw_vec_thread_block_desc, p_in_vec_block + - in_nchw_vec_block_desc.Get1dIndex(n_thread_data_begin, - c_thread_data, - hi_thread_data_begin, - wi_thread_data_begin), + in_nchw_vec_block_desc.GetOffsetFromMultiIndex(n_thread_data_begin, + c_thread_data, + hi_thread_data_begin, + wi_thread_data_begin), wei_kcyx_vec_thread_block_desc, p_wei_vec_block + - wei_kcyx_vec_block_desc.Get1dIndex(k_thread_data_begin, c_thread_data, 0, 0), + wei_kcyx_vec_block_desc.GetOffsetFromMultiIndex( + k_thread_data_begin, c_thread_data, 0, 0), out_nkhw_thread_desc, p_out_thread); #endif @@ -239,14 +242,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw( } // copy output tensor from register to global mem - threadwise_4d_tensor_copy( - out_nkhw_thread_desc, - p_out_thread, - out_nkhw_global_desc, - p_out_global + - out_nkhw_global_desc.Get1dIndex(n_block_data_begin + n_thread_data_begin, - k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin), - out_nkhw_thread_desc.GetLengths()); + 
threadwise_4d_tensor_copy(out_nkhw_thread_desc, + p_out_thread, + out_nkhw_global_desc, + p_out_global + + out_nkhw_global_desc.GetOffsetFromMultiIndex( + n_block_data_begin + n_thread_data_begin, + k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin), + out_nkhw_thread_desc.GetLengths()); } diff --git a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp index f04a283fcf..10eb0f8485 100644 --- a/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp +++ b/src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp @@ -217,7 +217,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread); const Float* p_wei_global_block_begin = - p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin); + p_wei_global + wei_ek_global_desc.GetOffsetFromMultiIndex(0, k_block_data_begin); for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock, p_wei_global_block_begin += CPerBlock * wei_ek_global_desc.GetStride(I0), @@ -251,10 +251,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( { auto f_accum = [](auto& acc, const auto&& v) { acc += v; }; - blockwise_batch_gemm.Run(p_wei_block + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0), - p_in_block + in_chwn_block_desc.Get1dIndex(0, y, x, 0), - p_out_thread, - f_accum); + blockwise_batch_gemm.Run( + p_wei_block + wei_cyxk_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_in_block + in_chwn_block_desc.GetOffsetFromMultiIndex(0, y, x, 0), + p_out_thread, + f_accum); } } } @@ -284,10 +285,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded( p_out_thread, out_khwn_global_desc, p_out_global + - 
out_khwn_global_desc.Get1dIndex(k_block_data_begin + k_thread_data_begin, - ho_block_data_begin + ho_thread_data_begin, - wo_block_data_begin + wo_thread_data_begin, - n_block_data_begin + n_thread_data_begin), + out_khwn_global_desc.GetOffsetFromMultiIndex(k_block_data_begin + k_thread_data_begin, + ho_block_data_begin + ho_thread_data_begin, + wo_block_data_begin + wo_thread_data_begin, + n_block_data_begin + n_thread_data_begin), out_hkwn_thread_desc.GetLengths(), reorder_khwn_from_hkwn); } diff --git a/src/include/tensor.hpp b/src/include/tensor.hpp index c91d132eec..e61ca1b432 100644 --- a/src/include/tensor.hpp +++ b/src/include/tensor.hpp @@ -93,7 +93,7 @@ struct TensorDescriptor const std::vector& GetStrides() const; template - std::size_t Get1dIndex(Is... is) const + std::size_t GetOffsetFromMultiIndex(Is... is) const { assert(sizeof...(Is) == this->GetNumOfDimension()); std::initializer_list iss{static_cast(is)...}; @@ -246,13 +246,13 @@ struct Tensor template T& operator()(Is... is) { - return mData[mDesc.Get1dIndex(is...)]; + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; } template const T& operator()(Is... 
is) const { - return mData[mDesc.Get1dIndex(is...)]; + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; } typename std::vector::iterator begin() { return mData.begin(); } diff --git a/src/include/threadwise_2d_tensor_op.hip.hpp b/src/include/threadwise_2d_tensor_op.hip.hpp index 24c6823f13..1bee7e801d 100644 --- a/src/include/threadwise_2d_tensor_op.hip.hpp +++ b/src/include/threadwise_2d_tensor_op.hip.hpp @@ -20,7 +20,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re { for(index_t did1 = 0; did1 < desc.GetLength(I1); ++did1) { - const index_t dindex = desc.Get1dIndex(did0, did1); + const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1); f(p[dindex]); } @@ -53,11 +53,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d { for(index_t did1 = 0; did1 < ref_desc.GetLength(I1); ++did1) { - const index_t aindex = src_desc.Get1dIndex(did0, did1); + const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1); const index_t did[2] = {did0, did1}; - const index_t bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]); + const index_t bindex = dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1]); f(p_src[aindex], p_dst[bindex]); } @@ -127,7 +127,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi { for(index_t did1 = 0; did1 < did1_end; ++did1) { - const index_t dindex = desc.Get1dIndex(did0, did1); + const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1); const index_t sindex = dindex + nshift * desc.GetStride(IDim{}); diff --git a/src/include/threadwise_4d_tensor_op.hip.hpp b/src/include/threadwise_4d_tensor_op.hip.hpp index 7255650c1a..b8a2c59a26 100644 --- a/src/include/threadwise_4d_tensor_op.hip.hpp +++ b/src/include/threadwise_4d_tensor_op.hip.hpp @@ -26,7 +26,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re { for(index_t did3 = 0; did3 < desc.GetLength(I3); ++did3) { - const index_t dindex = 
desc.Get1dIndex(did0, did1, did2, did3); + const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3); f(p[dindex]); } @@ -75,12 +75,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_given_ds { for(index_t did3 = 0; did3 < ref_desc.GetLength(I3); ++did3) { - const index_t aindex = src_desc.Get1dIndex(did0, did1, did2, did3); + const index_t aindex = src_desc.GetOffsetFromMultiIndex(did0, did1, did2, did3); const index_t did[4] = {did0, did1, did2, did3}; const index_t bindex = - dst_desc.Get1dIndex(did[IR0], did[IR1], did[IR2], did[IR3]); + dst_desc.GetOffsetFromMultiIndex(did[IR0], did[IR1], did[IR2], did[IR3]); f(p_src[aindex], p_dst[bindex]); @@ -178,7 +178,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi { for(index_t did3 = 0; did3 < did3_end; ++did3) { - const index_t dindex = desc.Get1dIndex(did0, did1, did2, did3); + const index_t dindex = desc.GetOffsetFromMultiIndex(did0, did1, did2, did3); const index_t sindex = dindex + nshift * desc.GetStride(IDim{}); diff --git a/src/include/threadwise_direct_convolution.hip.hpp b/src/include/threadwise_direct_convolution.hip.hpp index 94c5f8ecd3..068ba52c41 100644 --- a/src/include/threadwise_direct_convolution.hip.hpp +++ b/src/include/threadwise_direct_convolution.hip.hpp @@ -46,11 +46,14 @@ __device__ void threadwise_direct_convolution_1(InDesc, const index_t hi = ho + y; const index_t wi = wo + x; - const index_t in_index = in_desc.Get1dIndex(n, c, hi, wi); + const index_t in_index = + in_desc.GetOffsetFromMultiIndex(n, c, hi, wi); - const index_t wei_index = wei_desc.Get1dIndex(k, c, y, x); + const index_t wei_index = + wei_desc.GetOffsetFromMultiIndex(k, c, y, x); - const index_t out_index = out_desc.Get1dIndex(n, k, ho, wo); + const index_t out_index = + out_desc.GetOffsetFromMultiIndex(n, k, ho, wo); fused_multiply_accumulate( p_out[out_index], p_wei[wei_index], p_in[in_index]); @@ -143,14 +146,14 @@ __device__ void 
threadwise_direct_convolution_3(InDesc, { // read first input threadwise_4d_tensor_copy(in_desc, - p_in + in_desc.Get1dIndex(0, 0, y, 0), + p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, 0), in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); // read first 1x1 weight threadwise_4d_tensor_copy(wei_desc, - p_wei + wei_desc.Get1dIndex(0, 0, y, 0), + p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, 0), wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths()); @@ -164,7 +167,7 @@ __device__ void threadwise_direct_convolution_3(InDesc, { // read new weight threadwise_4d_tensor_copy(wei_desc, - p_wei + wei_desc.Get1dIndex(0, 0, y, x), + p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x), wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths()); @@ -175,10 +178,10 @@ __device__ void threadwise_direct_convolution_3(InDesc, // read new input threadwise_4d_tensor_copy( in_desc, - p_in + in_desc.Get1dIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1), + p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x + in_reg_desc.GetLength(I3) - 1), in_reg_desc, p_in_reg + - in_reg_desc.Get1dIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read), + in_reg_desc.GetOffsetFromMultiIndex(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read), in_desc_reg_new_read.GetLengths()); // do 1x1 conv @@ -196,14 +199,14 @@ __device__ void threadwise_direct_convolution_3(InDesc, { // read new weight threadwise_4d_tensor_copy(wei_desc, - p_wei + wei_desc.Get1dIndex(0, 0, y, x), + p_wei + wei_desc.GetOffsetFromMultiIndex(0, 0, y, x), wei_reg_desc, p_wei_reg, wei_reg_desc.GetLengths()); // read new input threadwise_4d_tensor_copy(in_desc, - p_in + in_desc.Get1dIndex(0, 0, y, x), + p_in + in_desc.GetOffsetFromMultiIndex(0, 0, y, x), in_reg_desc, p_in_reg, in_reg_desc.GetLengths()); diff --git a/src/include/threadwise_gemm.hip.hpp b/src/include/threadwise_gemm.hip.hpp index 6713b6ce63..5d82493e28 100644 --- a/src/include/threadwise_gemm.hip.hpp +++ b/src/include/threadwise_gemm.hip.hpp @@ -9,7 +9,7 @@ __device__ void 
threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread) { for(index_t j = 0; j < Matrix::NCol(); ++j) { - const index_t id = Matrix::Get1dIndex(i, j); + const index_t id = Matrix::GetOffsetFromMultiIndex(i, j); p_thread[id] = 0; } } @@ -39,8 +39,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix, { for(index_t j = 0; j < NCol; j += DataPerRead) { - const index_t src_index = src_mtx.Get1dIndex(i, j); - const index_t dst_index = dst_mtx.Get1dIndex(i, j); + const index_t src_index = src_mtx.GetOffsetFromMultiIndex(i, j); + const index_t dst_index = dst_mtx.GetOffsetFromMultiIndex(i, j); *reinterpret_cast(&p_dst[dst_index]) = *reinterpret_cast(&p_src[src_index]); @@ -83,9 +83,9 @@ __device__ void threadwise_gemm(MatrixA, { for(index_t j = 0; j < N; ++j) { - const index_t aindex = a_mtx.Get1dIndex(k, i); // A is transposed - const index_t bindex = b_mtx.Get1dIndex(k, j); - const index_t cindex = c_mtx.Get1dIndex(i, j); + const index_t aindex = a_mtx.GetOffsetFromMultiIndex(k, i); // A is transposed + const index_t bindex = b_mtx.GetOffsetFromMultiIndex(k, j); + const index_t cindex = c_mtx.GetOffsetFromMultiIndex(i, j); p_c_thread[cindex] += p_a_thread[aindex] * p_b_thread[bindex]; } diff --git a/src/include/threadwise_tensor_slice_op.hip.hpp b/src/include/threadwise_tensor_slice_op.hip.hpp index f2497e214c..6c441d289f 100644 --- a/src/include/threadwise_tensor_slice_op.hip.hpp +++ b/src/include/threadwise_tensor_slice_op.hip.hpp @@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, constexpr auto src_desc = SrcDesc{}; constexpr auto dst_desc = DstDesc{}; - constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{}); + constexpr auto ref_desc = make_packed_ConstantTensorDescriptor(SrcOpLengths{}); #if 0 if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0) @@ -53,9 +53,9 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc, static_for<0, nRead, 1>{}([&](auto IRead) { constexpr auto multi_id = 
decltype(Ids){}.PushBack(Number{}); - const index_t src_index = src_desc.Get1dIndex(multi_id); + const index_t src_index = src_desc.GetOffsetFromMultiIndex(multi_id); - const index_t dst_index = dst_desc.Get1dIndex(multi_id); + const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(multi_id); *(reinterpret_cast(&p_dst[dst_index])) = *(reinterpret_cast(&p_src[src_index])); @@ -84,9 +84,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc, ford{}([&](auto src_multi_id) { const auto dst_multi_id = reorder_array_given_new2old(src_multi_id, MapDst2Src{}); - const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id); + const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id); - const index_t src_index = src_desc.Get1dIndex(src_multi_id); + const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id); p_dst[dst_index] = p_src[src_index]; }); @@ -115,9 +115,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc, ford{}([&](auto dst_multi_id) { const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{}); - const index_t dst_index = dst_desc.Get1dIndex(dst_multi_id); + const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id); - const index_t src_index = src_desc.Get1dIndex(src_multi_id); + const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id); p_dst[dst_index] = p_src[src_index]; }); @@ -177,7 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, const auto src_multi_id = reorder_array_given_old2new(dst_multi_id, MapDst2Src{}); - const index_t src_index = src_desc.Get1dIndex(src_multi_id); + const index_t src_index = src_desc.GetOffsetFromMultiIndex(src_multi_id); vector_type::SetScalar( dst_vec_data, p_src[src_index], IDstData); @@ -186,7 +186,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc, // write data const auto dst_multi_id = ids.PushBack(IWrite.Get() * DstDataPerWrite); - const index_t dst_index = 
dst_desc.Get1dIndex(dst_multi_id); + const index_t dst_index = dst_desc.GetOffsetFromMultiIndex(dst_multi_id); *(reinterpret_cast(&p_dst[dst_index])) = dst_vec_data; }); @@ -204,5 +204,21 @@ threadwise_tensor_slice_copy_generic(SrcDesc, SliceLengths, DimAccessOrder) { - // not implemented + constexpr auto src_desc = SrcDesc{}; + constexpr auto dst_desc = DstDesc{}; + + constexpr auto slice_lengths_in_access_order = + SliceLengths{}.ReorderGivenNew2Old(DimAccessOrder{}); + + ford{}([&](auto data_multi_id_in_access_order) { + const auto data_multi_id = + reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{}); + + // NOTE(review): apply each tensor's own base offset — dst offset with dst_desc, + // src offset with src_desc (previous draft had these two swapped). + const index_t dst_index = + dst_desc.GetOffsetFromMultiIndex(dst_multi_offset + data_multi_id); + const index_t src_index = + src_desc.GetOffsetFromMultiIndex(src_multi_offset + data_multi_id); + + p_dst[dst_index] = p_src[src_index]; + }); }